diff --git a/recipes/configs/falcon3/10B_full.yaml b/recipes/configs/falcon3/10B_full.yaml new file mode 100644 index 0000000000..6bdafb4b22 --- /dev/null +++ b/recipes/configs/falcon3/10B_full.yaml @@ -0,0 +1,107 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Falcon3 10B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-10B-Base --output-dir /tmp/Falcon3-10B --ignore-patterns None +# +# To launch on 4 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config falcon3/10B_full +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config falcon3/10B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. It's +# best to use 10B_full.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-10B-Base/tokenizer.json + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.falcon3.falcon3_10b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-10B-Base + checkpoint_files: [ + model-00001-of-00005.safetensors, + model-00002-of-00005.safetensors, + model-00003-of-00005.safetensors, + model-00004-of-00005.safetensors, + model-00005-of-00005.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-10B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 5e-6 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory +optimizer_in_bwd: False # True saves memory. 
Requires gradient_accumulation_steps=1 + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-10B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: True + +# Profiler (disabled) +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 3 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/10B_full_single_device.yaml b/recipes/configs/falcon3/10B_full_single_device.yaml new file mode 100644 index 0000000000..c87ff7f52d --- /dev/null +++ b/recipes/configs/falcon3/10B_full_single_device.yaml @@ -0,0 +1,108 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Falcon3 10B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-10B-Base --output-dir /tmp/Falcon3-10B --ignore-patterns None +# +# The default config uses an optimizer from bitsandbytes. If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config falcon3/10B_full_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config falcon3/10B_full_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. + +# Tokenizer +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-10B-Base/tokenizer.json + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.falcon3.falcon3_10b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-10B-Base + checkpoint_files: [ + model-00001-of-00005.safetensors, + model-00002-of-00005.safetensors, + model-00003-of-00005.safetensors, + model-00004-of-00005.safetensors, + model-00005-of-00005.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-10B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 1 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + lr: 5e-6 +optimizer_in_bwd: True # True saves memory. 
Requires gradient_accumulation_steps=1 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-10B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: True + +# Profiler (disabled) +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 3 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/10B_lora.yaml b/recipes/configs/falcon3/10B_lora.yaml new file mode 100644 index 0000000000..0e342bd03e --- /dev/null +++ b/recipes/configs/falcon3/10B_lora.yaml @@ -0,0 +1,117 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Falcon3 10B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-10B-Base --output-dir /tmp/Falcon3-10B --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config falcon3/10B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config falcon3/10B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. 
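Note: across the full-finetune configs above, the number of samples contributing to a single optimizer update is the product of batch_size, gradient_accumulation_steps and, for the distributed recipe, the number of devices; optimizer_in_bwd fuses the optimizer step into the backward pass, which leaves nothing to accumulate and is why it requires gradient_accumulation_steps=1. A minimal sketch of that arithmetic, assuming the 10B_full defaults and the 4-GPU launch command from its header:

    # Illustrative arithmetic only; values taken from 10B_full.yaml and its launch command.
    batch_size = 2                   # per-device micro-batch
    gradient_accumulation_steps = 8  # micro-batches accumulated before each optimizer step
    num_devices = 4                  # --nproc_per_node 4

    effective_batch_size = batch_size * gradient_accumulation_steps * num_devices
    print(effective_batch_size)      # 64 samples per optimizer update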
+# For single device LoRA finetuning please use 10B_lora_single_device.yaml + + +# Model Arguments +model: + _component_: torchtune.models.falcon3.lora_falcon3_10b + lora_attn_modules: ['q_proj', 'v_proj', 'output_proj'] + apply_lora_to_mlp: True + apply_lora_to_output: False + lora_rank: 8 # higher increases accuracy and memory + lora_alpha: 16 # usually alpha=2*rank + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-10B-Base/tokenizer.json + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-10B-Base + checkpoint_files: [ + model-00001-of-00005.safetensors, + model-00002-of-00005.safetensors, + model-00003-of-00005.safetensors, + model-00004-of-00005.safetensors, + model-00005-of-00005.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-10B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +batch_size: 2 +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-10B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/10B_lora_single_device.yaml b/recipes/configs/falcon3/10B_lora_single_device.yaml new file mode 100644 index 0000000000..98e9f0dd76 --- /dev/null +++ b/recipes/configs/falcon3/10B_lora_single_device.yaml @@ -0,0 +1,116 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Falcon3 10B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-10B-Base --output-dir /tmp/Falcon3-10B --ignore-patterns None +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config falcon3/10B_lora_single_device +# +# You can add specific overrides through the command line. 
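Note: the LoRA configs set lora_rank: 8 and lora_alpha: 16, and the low-rank update is typically mixed into the frozen projection with a factor of alpha/rank (2.0 here). The sketch below shows a generic LoRA forward pass for reference; it is an illustration of the technique with made-up dimensions, not torchtune's LoRALinear implementation.

    import torch
    import torch.nn as nn

    class ToyLoRALinear(nn.Module):
        """Generic LoRA: y = W x + (alpha / rank) * B(A x), with W frozen."""
        def __init__(self, in_dim, out_dim, rank=8, alpha=16, dropout=0.0):
            super().__init__()
            self.base = nn.Linear(in_dim, out_dim, bias=False)
            self.base.weight.requires_grad_(False)              # frozen pretrained weight
            self.lora_a = nn.Linear(in_dim, rank, bias=False)   # trainable down-projection
            self.lora_b = nn.Linear(rank, out_dim, bias=False)  # trainable up-projection
            nn.init.zeros_(self.lora_b.weight)                  # update starts at zero
            self.dropout = nn.Dropout(dropout)
            self.scaling = alpha / rank                         # 16 / 8 = 2.0

        def forward(self, x):
            return self.base(x) + self.scaling * self.lora_b(self.lora_a(self.dropout(x)))

    layer = ToyLoRALinear(in_dim=3072, out_dim=3072)  # placeholder dims, not Falcon3's
    print(layer(torch.randn(1, 3072)).shape)          # torch.Size([1, 3072])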
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config falcon3/10B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. + + +# Model Arguments +model: + _component_: torchtune.models.falcon3.lora_falcon3_10b + lora_attn_modules: ['q_proj', 'v_proj', 'output_proj'] + apply_lora_to_mlp: True + apply_lora_to_output: False + lora_rank: 8 # higher increases accuracy and memory + lora_alpha: 16 # usually alpha=2*rank + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-10B-Base/tokenizer.json + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-10B-Base + checkpoint_files: [ + model-00001-of-00005.safetensors, + model-00002-of-00005.safetensors, + model-00003-of-00005.safetensors, + model-00004-of-00005.safetensors, + model-00005-of-00005.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-10B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-10B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 + +# Activations Offloading +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/1B_full.yaml b/recipes/configs/falcon3/1B_full.yaml new file mode 100644 index 0000000000..a14d1d6c14 --- /dev/null +++ b/recipes/configs/falcon3/1B_full.yaml @@ -0,0 +1,103 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Falcon3 1B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-1B-Base --output-dir /tmp/Falcon3-1B --ignore-patterns None +# +# To launch on 4 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 4 
full_finetune_distributed --config falcon3/1B_full +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config falcon3/1B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. It's +# best to use 1B_full.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-1B-Base/tokenizer.json + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.falcon3.falcon3_1b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-1B-Base + checkpoint_files: [ + model.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-1B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 5e-6 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory +optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1 + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-1B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: True + +# Profiler (disabled) +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 3 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/1B_full_single_device.yaml b/recipes/configs/falcon3/1B_full_single_device.yaml new file mode 100644 index 0000000000..bd95d54971 --- /dev/null +++ b/recipes/configs/falcon3/1B_full_single_device.yaml @@ -0,0 +1,104 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Falcon3 1B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-1B-Base --output-dir /tmp/Falcon3-7B-Base --ignore-patterns None +# +# The default config uses an optimizer from bitsandbytes. 
If you do not have it installed,
+# you can install it with
+# pip install bitsandbytes
+#
+# To launch on a single device, run the following command from root:
+# tune run full_finetune_single_device --config falcon3/1B_full_single_device
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training
+# you can run:
+# tune run full_finetune_single_device --config falcon3/1B_full_single_device checkpointer.checkpoint_dir=
+#
+# This config works only for training on single device.
+
+# Tokenizer
+tokenizer:
+  _component_: torchtune.models.falcon3.falcon3_tokenizer
+  path: /tmp/Falcon3-1B-Base/tokenizer.json
+  max_seq_len: null
+
+# Dataset
+dataset:
+  _component_: torchtune.datasets.alpaca_cleaned_dataset
+  packed: False # True increases speed
+seed: null
+shuffle: True
+
+# Model Arguments
+model:
+  _component_: torchtune.models.falcon3.falcon3_1b
+
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Falcon3-1B-Base
+  checkpoint_files: [
+    model.safetensors,
+  ]
+  recipe_checkpoint: null
+  output_dir: /tmp/Falcon3-1B-Base
+  model_type: FALCON3
+resume_from_checkpoint: False
+
+# Fine-tuning arguments
+batch_size: 1
+epochs: 1
+optimizer:
+  _component_: torch.optim.AdamW
+  lr: 5e-6
+optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
+loss:
+  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+max_steps_per_epoch: null
+gradient_accumulation_steps: 1 # Use to increase virtual batch size
+compile: False # pytorch compile, set to true for better perf/memory
+
+# Training environment
+device: cuda
+
+# Memory management
+enable_activation_checkpointing: True # True reduces memory
+enable_activation_offloading: False # True reduces memory
+
+# Reduced precision
+dtype: bf16
+
+# Logging
+metric_logger:
+  _component_: torchtune.training.metric_logging.DiskLogger
+  log_dir: ${output_dir}
+output_dir: /tmp/Falcon3-1B-Base-finetune
+log_every_n_steps: 1
+log_peak_memory_stats: True
+
+# Profiler (disabled)
+profiler:
+  _component_: torchtune.training.setup_torch_profiler
+  enabled: False
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 3
+  active_steps: 2
+  num_cycles: 1
diff --git a/recipes/configs/falcon3/1B_lora.yaml b/recipes/configs/falcon3/1B_lora.yaml
new file mode 100644
index 0000000000..14736e8475
--- /dev/null
+++ b/recipes/configs/falcon3/1B_lora.yaml
@@ -0,0 +1,113 @@
+# Config for multi-device LoRA finetuning in lora_finetune_distributed.py
+# using a Falcon3 1B
+#
+# This config assumes that you've run the following command before launching
+# this run:
+# tune download tiiuae/Falcon3-1B-Base --output-dir /tmp/Falcon3-1B-Base --ignore-patterns None
+#
+# To launch on 2 devices, run the following command from root:
+# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config falcon3/1B_lora
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training
+# you can run:
+# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config falcon3/1B_lora checkpointer.checkpoint_dir=
+#
+# This config works best when the model is being fine-tuned on 2+ GPUs.
+# For single device LoRA finetuning please use 1B_lora_single_device.yaml
+
+
+# Model Arguments
+model:
+  _component_: torchtune.models.falcon3.lora_falcon3_1b
+  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
+  apply_lora_to_mlp: True
+  apply_lora_to_output: False
+  lora_rank: 8 # higher increases accuracy and memory
+  lora_alpha: 16 # usually alpha=2*rank
+  lora_dropout: 0.0
+
+tokenizer:
+  _component_: torchtune.models.falcon3.falcon3_tokenizer
+  path: /tmp/Falcon3-1B-Base/tokenizer.json
+  max_seq_len: null
+
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Falcon3-1B-Base
+  checkpoint_files: [
+    model.safetensors,
+  ]
+  recipe_checkpoint: null
+  output_dir: /tmp/Falcon3-1B-Base
+  model_type: FALCON3
+resume_from_checkpoint: False
+
+# Dataset and Sampler
+dataset:
+  _component_: torchtune.datasets.alpaca_cleaned_dataset
+  packed: False # True increases speed
+seed: null
+shuffle: True
+
+
+# Optimizer and Scheduler
+optimizer:
+  _component_: torch.optim.AdamW
+  fused: True
+  weight_decay: 0.01
+  lr: 3e-4
+lr_scheduler:
+  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
+  num_warmup_steps: 100
+
+loss:
+  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+
+# Training
+batch_size: 2
+epochs: 1
+max_steps_per_epoch: null
+gradient_accumulation_steps: 8 # Use to increase virtual batch size
+compile: False # pytorch compile, set to true for better perf/memory
+
+# Logging
+metric_logger:
+  _component_: torchtune.training.metric_logging.DiskLogger
+  log_dir: ${output_dir}
+output_dir: /tmp/Falcon3-1B-Base-finetune
+log_every_n_steps: 1
+log_peak_memory_stats: False
+
+# Environment
+device: cuda
+dtype: bf16
+enable_activation_checkpointing: False # True reduces memory
+enable_activation_offloading: False # True reduces memory
+
+# Show case the usage of pytorch profiler
+# Set enabled to False as it's only needed for debugging training
+profiler:
+  _component_: torchtune.training.setup_torch_profiler
+
+  enabled: False
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
diff --git a/recipes/configs/falcon3/1B_lora_single_device.yaml b/recipes/configs/falcon3/1B_lora_single_device.yaml
new file mode 100644
index 0000000000..1e5ec0f19f
--- /dev/null
+++ b/recipes/configs/falcon3/1B_lora_single_device.yaml
@@ -0,0 +1,112 @@
+# Config for single device LoRA finetuning in lora_finetune_single_device.py
+# using a Falcon3 1B model
+#
+# This config assumes that you've run the following command before launching
+# this run:
+# tune download tiiuae/Falcon3-1B-Base --output-dir /tmp/Falcon3-1B-Base --ignore-patterns None
+#
+# To launch on a single device, run the following command from root:
+# tune run lora_finetune_single_device --config falcon3/1B_lora_single_device
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training
+# you can run:
+# tune run lora_finetune_single_device --config falcon3/1B_lora_single_device checkpointer.checkpoint_dir=
+#
+# This config works only for training on single device.
+
+
+# Model Arguments
+model:
+  _component_: torchtune.models.falcon3.lora_falcon3_1b
+  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
+  apply_lora_to_mlp: True
+  apply_lora_to_output: False
+  lora_rank: 8 # higher increases accuracy and memory
+  lora_alpha: 16 # usually alpha=2*rank
+  lora_dropout: 0.0
+
+tokenizer:
+  _component_: torchtune.models.falcon3.falcon3_tokenizer
+  path: /tmp/Falcon3-1B-Base/tokenizer.json
+  max_seq_len: null
+
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Falcon3-1B-Base
+  checkpoint_files: [
+    model.safetensors,
+  ]
+  recipe_checkpoint: null
+  output_dir: /tmp/Falcon3-1B-Base
+  model_type: FALCON3
+resume_from_checkpoint: False
+
+# Dataset and Sampler
+dataset:
+  _component_: torchtune.datasets.alpaca_cleaned_dataset
+  packed: False # True increases speed
+seed: null
+shuffle: True
+batch_size: 2
+
+# Optimizer and Scheduler
+optimizer:
+  _component_: torch.optim.AdamW
+  fused: True
+  weight_decay: 0.01
+  lr: 3e-4
+lr_scheduler:
+  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
+  num_warmup_steps: 100
+
+loss:
+  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+
+# Training
+epochs: 1
+max_steps_per_epoch: null
+gradient_accumulation_steps: 8 # Use to increase virtual batch size
+compile: False # pytorch compile, set to true for better perf/memory
+
+# Logging
+metric_logger:
+  _component_: torchtune.training.metric_logging.DiskLogger
+  log_dir: ${output_dir}
+output_dir: /tmp/Falcon3-1B-Base-finetune
+log_every_n_steps: 1
+log_peak_memory_stats: False
+
+# Environment
+device: cuda
+dtype: bf16
+
+# Activations Offloading
+enable_activation_checkpointing: True # True reduces memory
+enable_activation_offloading: False # True reduces memory
+
+# Show case the usage of pytorch profiler
+# Set enabled to False as it's only needed for debugging training
+profiler:
+  _component_: torchtune.training.setup_torch_profiler
+  enabled: False
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
diff --git a/recipes/configs/falcon3/3B_full.yaml b/recipes/configs/falcon3/3B_full.yaml
new file mode 100644
index 0000000000..ca32297e7e
--- /dev/null
+++ b/recipes/configs/falcon3/3B_full.yaml
@@ -0,0 +1,104 @@
+# Config for multi-device full finetuning in full_finetune_distributed.py
+# using a Falcon3 3B
+#
+# This config assumes that you've run the following command before launching
+# this run:
+# tune download tiiuae/Falcon3-3B-Base --output-dir /tmp/Falcon3-3B --ignore-patterns None
+#
+# To launch on 4 devices, run the following command from root:
+# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config falcon3/3B_full
+#
+# You can add specific overrides through the command line.
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config falcon3/3B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. It's +# best to use 3B_full.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-3B-Base/tokenizer.json + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.falcon3.falcon3_3b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-3B-Base + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-3B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 5e-6 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory +optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1 + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-3B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: True + +# Profiler (disabled) +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 3 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/3B_full_single_device.yaml b/recipes/configs/falcon3/3B_full_single_device.yaml new file mode 100644 index 0000000000..baf63aade7 --- /dev/null +++ b/recipes/configs/falcon3/3B_full_single_device.yaml @@ -0,0 +1,105 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Falcon3 3B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-3B-Base --output-dir /tmp/Falcon3-7B-Base --ignore-patterns None +# +# The default config uses an optimizer from bitsandbytes. If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config falcon3/3B_full_single_device +# +# You can add specific overrides through the command line. 
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config falcon3/3B_full_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. + +# Tokenizer +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-3B-Base/tokenizer.json + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.falcon3.falcon3_3b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-3B-Base + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-3B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 1 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + lr: 5e-6 +optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-7B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: True + +# Profiler (disabled) +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 3 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/3B_lora.yaml b/recipes/configs/falcon3/3B_lora.yaml new file mode 100644 index 0000000000..7b5cddc28e --- /dev/null +++ b/recipes/configs/falcon3/3B_lora.yaml @@ -0,0 +1,114 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Falcon3 3B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-3B-Base --output-dir /tmp/Falcon3-3B --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config falcon3/3B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config falcon3/3B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. 
+# For single device LoRA finetuning please use 3B_lora_single_device.yaml + + +# Model Arguments +model: + _component_: torchtune.models.falcon3.lora_falcon3_3b + lora_attn_modules: ['q_proj', 'v_proj', 'output_proj'] + apply_lora_to_mlp: True + apply_lora_to_output: False + lora_rank: 8 # higher increases accuracy and memory + lora_alpha: 16 # usually alpha=2*rank + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-3B-Base/tokenizer.json + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-3B-Base + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-3B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +batch_size: 2 +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-3B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/3B_lora_single_device.yaml b/recipes/configs/falcon3/3B_lora_single_device.yaml new file mode 100644 index 0000000000..8c86548a8a --- /dev/null +++ b/recipes/configs/falcon3/3B_lora_single_device.yaml @@ -0,0 +1,113 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Falcon3 3B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-3B-Base --output-dir /tmp/Falcon3-3B --ignore-patterns None +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config falcon3/3B_lora_single_device +# +# You can add specific overrides through the command line. 
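Note: the LoRA configs pair a base lr of 3e-4 with get_cosine_schedule_with_warmup and num_warmup_steps: 100. The snippet below reproduces the usual shape of such a schedule (linear warmup followed by cosine decay to zero) so the warmup length can be sanity-checked against the expected number of optimizer steps; it is a generic formula with a placeholder total step count, not torchtune's exact implementation.

    import math

    def lr_multiplier(step, num_warmup_steps=100, num_training_steps=1000):
        # num_training_steps is a placeholder; in practice it depends on dataset size,
        # batch_size and gradient_accumulation_steps.
        if step < num_warmup_steps:
            return step / max(1, num_warmup_steps)           # linear warmup: 0 -> 1
        progress = (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
        return 0.5 * (1.0 + math.cos(math.pi * progress))    # cosine decay: 1 -> 0

    for step in (0, 50, 100, 550, 1000):
        print(step, 3e-4 * lr_multiplier(step))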
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config falcon3/3B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. + + +# Model Arguments +model: + _component_: torchtune.models.falcon3.lora_falcon3_3b + lora_attn_modules: ['q_proj', 'v_proj', 'output_proj'] + apply_lora_to_mlp: True + apply_lora_to_output: False + lora_rank: 8 # higher increases accuracy and memory + lora_alpha: 16 # usually alpha=2*rank + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-3B-Base/tokenizer.json + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-3B-Base + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-3B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-3B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 + +# Activations Offloading +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/7B_full.yaml b/recipes/configs/falcon3/7B_full.yaml new file mode 100644 index 0000000000..c2b41d8c33 --- /dev/null +++ b/recipes/configs/falcon3/7B_full.yaml @@ -0,0 +1,106 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Falcon3 7B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-7B-Base --output-dir /tmp/Falcon3-7B --ignore-patterns None +# +# To launch on 4 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config falcon3/7B_full +# +# You can add specific overrides through the command line. 
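Note: the configs expose enable_activation_checkpointing and enable_activation_offloading as memory/compute trade-offs but do not describe what the first one does. As a point of reference, this is the standard PyTorch activation-checkpointing mechanism applied to a toy block; torchtune applies it per transformer layer, and the module below is only an assumed stand-in for illustration.

    import torch
    import torch.nn as nn
    from torch.utils.checkpoint import checkpoint

    block = nn.Sequential(nn.Linear(256, 1024), nn.GELU(), nn.Linear(1024, 256))
    x = torch.randn(4, 256, requires_grad=True)

    # Intermediate activations of `block` are not stored; they are recomputed
    # during backward, trading extra compute for lower peak memory.
    y = checkpoint(block, x, use_reentrant=False)
    y.sum().backward()
    print(x.grad.shape)  # torch.Size([4, 256])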
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config falcon3/7B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. It's +# best to use 7B_full.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-7B-Base/tokenizer.json + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.falcon3.falcon3_7b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-7B-Base + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-7B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 5e-6 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory +optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1 + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-7B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: True + +# Profiler (disabled) +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 3 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/7B_full_single_device.yaml b/recipes/configs/falcon3/7B_full_single_device.yaml new file mode 100644 index 0000000000..313ebb46fd --- /dev/null +++ b/recipes/configs/falcon3/7B_full_single_device.yaml @@ -0,0 +1,107 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Falcon3 7B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-7B-Base --output-dir /tmp/Falcon3-7B --ignore-patterns None +# +# The default config uses an optimizer from bitsandbytes. 
If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config falcon3/7B_full_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config falcon3/7B_full_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. + +# Tokenizer +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-7B-Base/tokenizer.json + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.falcon3.falcon3_7b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-7B-Base + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-7B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 1 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + lr: 5e-6 +optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-7B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: True + +# Profiler (disabled) +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 3 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/7B_lora.yaml b/recipes/configs/falcon3/7B_lora.yaml new file mode 100644 index 0000000000..e8080b5476 --- /dev/null +++ b/recipes/configs/falcon3/7B_lora.yaml @@ -0,0 +1,116 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Falcon3 7B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-7B-Base --output-dir /tmp/Falcon3-7B --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config falcon3/7B_lora +# +# You can add specific overrides through the command line. 
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config falcon3/7B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# For single device LoRA finetuning please use 7B_lora_single_device.yaml + + +# Model Arguments +model: + _component_: torchtune.models.falcon3.lora_falcon3_7b + lora_attn_modules: ['q_proj', 'v_proj', 'output_proj'] + apply_lora_to_mlp: True + apply_lora_to_output: False + lora_rank: 8 # higher increases accuracy and memory + lora_alpha: 16 # usually alpha=2*rank + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-7B-Base/tokenizer.json + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-7B-Base + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-7B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +batch_size: 2 +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-7B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/7B_lora_single_device.yaml b/recipes/configs/falcon3/7B_lora_single_device.yaml new file mode 100644 index 0000000000..98b56be550 --- /dev/null +++ b/recipes/configs/falcon3/7B_lora_single_device.yaml @@ -0,0 +1,115 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Falcon3 7B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-7B-Base --output-dir /tmp/Falcon3-7B --ignore-patterns None +# +# To launch on a single device, 
run the following command from root: +# tune run lora_finetune_single_device --config falcon3/7B_lora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config falcon3/7B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. + + +# Model Arguments +model: + _component_: torchtune.models.falcon3.lora_falcon3_7b + lora_attn_modules: ['q_proj', 'v_proj', 'output_proj'] + apply_lora_to_mlp: True + apply_lora_to_output: False + lora_rank: 8 # higher increases accuracy and memory + lora_alpha: 16 # usually alpha=2*rank + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-7B-Base/tokenizer.json + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-7B-Base + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-7B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-7B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 + +# Activations Offloading +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/eleuther_evaluation.yaml b/recipes/configs/falcon3/eleuther_evaluation.yaml new file mode 100644 index 0000000000..ad3dbc990f --- /dev/null +++ b/recipes/configs/falcon3/eleuther_evaluation.yaml @@ -0,0 +1,41 @@ +# Config for EleutherEvalRecipe in eleuther_eval.py +# +# To launch, run the following command from root torchtune directory: +# tune run eleuther_eval --config eleuther_evaluation tasks=["truthfulqa_mc2","hellaswag"] + +# Model 
Arguments +model: + _component_: torchtune.models.falcon3.falcon3_7b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-7B-Base/ + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + output_dir: /tmp/Falcon3-7B-Base/ + model_type: FALCON3 + +# Tokenizer +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-7B-Base/tokenizer.json + max_seq_len: null + +# Environment +device: cuda +dtype: bf16 +seed: 1234 # It is not recommended to change this seed, b/c it matches EleutherAI's default seed + +# EleutherAI specific eval args +tasks: ["truthfulqa_mc2"] +limit: null +max_seq_length: 4096 +batch_size: 8 +enable_kv_cache: True + +# Quantization specific args +quantizer: null diff --git a/recipes/configs/falcon3/generation.yaml b/recipes/configs/falcon3/generation.yaml new file mode 100644 index 0000000000..0b6cba28e9 --- /dev/null +++ b/recipes/configs/falcon3/generation.yaml @@ -0,0 +1,44 @@ +# Config for running the InferenceRecipe in generate.py to generate output from an LLM +# +# To launch, run the following command from root torchtune directory: +# tune run generate --config generation + +# Model arguments +model: + _component_: torchtune.models.falcon3.falcon3_7b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-7B-Base/ + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + output_dir: /tmp/Falcon3-7B-Base/ + model_type: FALCON3 + +device: cuda +dtype: bf16 + +seed: 1234 + +# Tokenizer arguments +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-7B-Base/tokenizer.json + max_seq_len: null + prompt_template: null + +# Generation arguments; defaults taken from gpt-fast +prompt: + system: null + user: "Tell me a joke." 
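Note: generation.yaml sets temperature: 0.6 and top_k: 300 but does not spell out how they interact. The snippet below is a generic sketch of temperature-scaled top-k sampling from a logits vector (random logits stand in for real model output, and the vocabulary size is an assumption), not the exact sampling code used by the generate recipe.

    import torch

    def sample_next_token(logits, temperature=0.6, top_k=300):
        logits = logits / max(temperature, 1e-5)       # <1.0 sharpens the distribution
        k = min(top_k, logits.size(-1))
        top_vals, top_idx = torch.topk(logits, k)      # restrict to the k most likely tokens
        probs = torch.softmax(top_vals, dim=-1)
        return top_idx[torch.multinomial(probs, num_samples=1)]

    vocab_size = 131072  # assumed vocabulary size, for illustration only
    print(sample_next_token(torch.randn(vocab_size)).item())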
+max_new_tokens: 300 +temperature: 0.6 # 0.8 and 0.6 are popular values to try +top_k: 300 + +enable_kv_cache: True + +quantizer: null diff --git a/recipes/configs/falcon3/quantization.yaml b/recipes/configs/falcon3/quantization.yaml new file mode 100644 index 0000000000..d798f31b06 --- /dev/null +++ b/recipes/configs/falcon3/quantization.yaml @@ -0,0 +1,30 @@ +# Config for QuantizationRecipe in quantize.py +# +# To launch, run the following command from root torchtune directory: +# tune run quantize --config quantization + +# +# Model arguments +model: + _component_: torchtune.models.falcon3.falcon3_7b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-7B-Base/ + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-7B-Base/ + model_type: FALCON3 + +device: cuda +dtype: bf16 +seed: 1234 + +quantizer: + _component_: torchtune.training.quantization.Int8DynActInt4WeightQuantizer + groupsize: 256 diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index faf1ec7124..8cd74c08f4 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -27,6 +27,22 @@ class Recipe: name="full_finetune_single_device", file_path="full_finetune_single_device.py", configs=[ + Config( + name="falcon3/1B_full_single_device", + file_path="falcon3/1B_full_single_device.yaml", + ), + Config( + name="falcon3/3B_full_single_device", + file_path="falcon3/3B_full_single_device.yaml", + ), + Config( + name="falcon3/7B_full_single_device", + file_path="falcon3/7B_full_single_device.yaml", + ), + Config( + name="falcon3/10B_full_single_device", + file_path="falcon3/10B_full_single_device.yaml", + ), Config( name="llama2/7B_full_low_memory", file_path="llama2/7B_full_low_memory.yaml", @@ -98,6 +114,10 @@ class Recipe: name="full_finetune_distributed", file_path="full_finetune_distributed.py", configs=[ + Config(name="falcon3/1B_full", file_path="falcon3/1B_full.yaml"), + Config(name="falcon3/3B_full", file_path="falcon3/3B_full.yaml"), + Config(name="falcon3/7B_full", file_path="falcon3/7B_full.yaml"), + Config(name="falcon3/10B_full", file_path="falcon3/10B_full.yaml"), Config(name="llama2/7B_full", file_path="llama2/7B_full.yaml"), Config(name="llama2/13B_full", file_path="llama2/13B_full.yaml"), Config(name="llama3/8B_full", file_path="llama3/8B_full.yaml"), @@ -136,6 +156,22 @@ class Recipe: name="lora_finetune_single_device", file_path="lora_finetune_single_device.py", configs=[ + Config( + name="falcon3/1B_lora_single_device", + file_path="falcon3/1B_lora_single_device.yaml", + ), + Config( + name="falcon3/3B_lora_single_device", + file_path="falcon3/3B_lora_single_device.yaml", + ), + Config( + name="falcon3/7B_lora_single_device", + file_path="falcon3/7B_lora_single_device.yaml", + ), + Config( + name="falcon3/10B_lora_single_device", + file_path="falcon3/10B_lora_single_device.yaml", + ), Config( name="llama2/7B_lora_single_device", file_path="llama2/7B_lora_single_device.yaml", @@ -340,6 +376,10 @@ class Recipe: name="lora_finetune_distributed", file_path="lora_finetune_distributed.py", configs=[ + Config(name="falcon3/1B_lora", file_path="falcon3/1B_lora.yaml"), + Config(name="falcon3/3B_lora", file_path="falcon3/3B_lora.yaml"), + Config(name="falcon3/7B_lora", file_path="falcon3/7B_lora.yaml"), + Config(name="falcon3/10B_lora", 
file_path="falcon3/10B_lora.yaml"), Config(name="llama2/7B_lora", file_path="llama2/7B_lora.yaml"), Config(name="llama2/13B_lora", file_path="llama2/13B_lora.yaml"), Config(name="llama2/70B_lora", file_path="llama2/70B_lora.yaml"), @@ -449,6 +489,10 @@ class Recipe: file_path="eleuther_eval.py", configs=[ Config(name="eleuther_evaluation", file_path="eleuther_evaluation.yaml"), + Config( + name="falcon3/evaluation", + file_path="falcon3/evaluation.yaml", + ), Config( name="llama3_2_vision/11B_evaluation", file_path="llama3_2_vision/11B_evaluation.yaml", diff --git a/torchtune/models/falcon3/__init__.py b/torchtune/models/falcon3/__init__.py new file mode 100644 index 0000000000..f8cdbdeac7 --- /dev/null +++ b/torchtune/models/falcon3/__init__.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from ._tokenizer import Falcon3Tokenizer +from ._component_builders import falcon3, lora_falcon3 +from ._convert_weights import falcon3_hf_to_tune, falcon3_tune_to_hf # noqa +from ._model_builders import ( # noqa + falcon3_tokenizer, + falcon3_1b, + falcon3_3b, + falcon3_7b, + falcon3_10b, + lora_falcon3_1b, + lora_falcon3_3b, + lora_falcon3_7b, + lora_falcon3_10b +) +from ._positional_embeddings import Falcon3RotaryPositionalEmbeddings + +__all__ = [ + "falcon3", + "falcon3_1b", + "falcon3_3b", + "falcon3_7b", + "falcon3_10b", + "falcon3_hf_to_tune", + "falcon3_tune_to_hf", + "lora_falcon3", + "lora_falcon3_1b", + "lora_falcon3_3b", + "lora_falcon3_7b", + "lora_falcon3_10b", + "Falcon3Tokenizer", + "falcon3_tokenizer", + "Falcon3RotaryPositionalEmbeddings", +] + + diff --git a/torchtune/models/falcon3/_component_builders.py b/torchtune/models/falcon3/_component_builders.py new file mode 100644 index 0000000000..21eec709e8 --- /dev/null +++ b/torchtune/models/falcon3/_component_builders.py @@ -0,0 +1,447 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from functools import partial +from typing import List +from torchtune.modules.common_utils import reparametrize_as_dtype_state_dict_post_hook + +from torch import nn +from torchtune.modules.transformer import TransformerDecoder +from torchtune.models.falcon3._positional_embeddings import Falcon3RotaryPositionalEmbeddings + +from torchtune.modules import ( + MultiHeadAttention, + FeedForward, + RMSNorm, + TransformerSelfAttentionLayer, + TiedLinear +) + + +from torchtune.modules.peft import DoRALinear, LORA_ATTN_MODULES, LoRALinear + +""" +Component builders for the Falcon3 model and popular variants such as LoRA. + +torchtune provides composable building blocks. Builder functions help +stitch these building blocks into higher-level components. This design has +two benefits: +- The building blocks themselves are very flexible. For example, ``MultiHeadAttention`` +can take either nn.Linear or nn.LoRALinear for ``q_proj``. +- Builder functions expose a set of configurable params which keep the constructors of +the building blocks simple. 
+""" + + +def falcon3( + vocab_size: int, + num_layers: int, + num_heads: int, + num_kv_heads: int, + embed_dim: int, + intermediate_dim: int, + max_seq_len: int, + attn_dropout: float = 0.0, + norm_eps: float = 1e-06, + rope_base: float = 1_000_042, + tie_word_embeddings: bool = False, +) -> TransformerDecoder: + """ + Build the decoder associated with the Falcon3 model. This includes: + - Token embeddings + - num_layers number of TransformerSelfAttentionLayer blocks + - RMS Norm layer applied to the output of the transformer + - Final projection into token space + + Args: + vocab_size (int): number of tokens in vocabulary. + num_layers (int): number of layers in the transformer decoder. + num_heads (int): number of query heads. For MHA this is also the + number of heads for key and value + num_kv_heads (int): number of key and value heads. User should ensure + `num_heads` % `num_kv_heads` == 0. For standard MHA set `num_kv_heads` == `num_heads`, + for GQA `num_kv_heads` < `num_heads`, and for MQA set `num_kv_heads` == 1. + embed_dim (int): embedding dimension for self-attention + max_seq_len (int): maximum sequence length the model will be run with, as used + by :func:`~torchtune.modules.KVCache` + attn_dropout (float): dropout value passed onto scaled_dot_product_attention. + Default: 0.0 + intermediate_dim (Optional[int]): intermediate dimension for MLP. If not specified, + + norm_eps (float): epsilon in RMS norms. + rope_base (float): the base period of the RoPE embeddings. + tie_word_embeddings (bool): whether the model's input and output word embeddings should be tied. + + Returns: + TransformerDecoder: Instantiation of Falcon3 model. + """ + head_dim = embed_dim // num_heads + num_kv_heads = num_kv_heads if num_kv_heads else num_heads + + rope = Falcon3RotaryPositionalEmbeddings(dim=head_dim, max_seq_len=max_seq_len, base=rope_base) + self_attn = MultiHeadAttention( + embed_dim=embed_dim, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_dim=head_dim, + q_proj=nn.Linear(embed_dim, num_heads * head_dim, bias=False), + k_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False), + v_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False), + output_proj=nn.Linear(embed_dim, embed_dim, bias=False), + pos_embeddings=rope, + kv_cache=None, + max_seq_len=max_seq_len, + attn_dropout=attn_dropout, + ) + mlp = falcon3_mlp(dim=embed_dim, hidden_dim=intermediate_dim) + layer = TransformerSelfAttentionLayer( + attn=self_attn, + mlp=mlp, + sa_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + mlp_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + ) + tok_embeddings = nn.Embedding(vocab_size, embed_dim) + if tie_word_embeddings: + output_proj = TiedLinear(tok_embeddings) + else: + output_proj = nn.Linear(embed_dim, vocab_size, bias=False) + return TransformerDecoder( + tok_embeddings=tok_embeddings, + layers=layer, + num_layers=num_layers, + max_seq_len=max_seq_len, + num_heads=num_heads, + head_dim=head_dim, + norm=RMSNorm(embed_dim, eps=norm_eps), + output=output_proj, + ) + + +def falcon3_mlp(dim: int, hidden_dim: int) -> FeedForward: + """ + Build the MLP layer associated with the Falcon3 model. 
+ """ + gate_proj = nn.Linear(dim, hidden_dim, bias=False) + down_proj = nn.Linear(hidden_dim, dim, bias=False) + up_proj = nn.Linear(dim, hidden_dim, bias=False) + return FeedForward(gate_proj=gate_proj, down_proj=down_proj, up_proj=up_proj) + +# ------------------ LoRA Falcon3 ------------------ + + +def lora_falcon3( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + *, + # falcon3 args + vocab_size: int, + num_layers: int, + num_heads: int, + num_kv_heads: int, + embed_dim: int, + intermediate_dim: int, + max_seq_len: int, + attn_dropout: float = 0.0, + norm_eps: float = 1e-06, + rope_base: float = 1_000_042, + tie_word_embeddings: bool = False, + # LoRA args + lora_rank: int, + lora_alpha: float, + lora_dropout: float = 0.0, + use_dora: bool = False, + # Quantization args + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Return a version of Falcon3 (an instance of :func:`~torchtune.models.falcon3.transformer.Falcon3TransformerDecoder`) + with LoRA applied based on the passed in configuration. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + vocab_size (int): number of tokens in vocabulary. + num_layers (int): number of layers in the transformer decoder. + num_heads (int): number of query heads. For MHA this is also the + number of heads for key and value + num_kv_heads (int): number of key and value heads. User should ensure + `num_heads` % `num_kv_heads` == 0. For standard MHA set `num_kv_heads` == `num_heads`, + for GQA `num_kv_heads` < `num_heads`, and for MQA set `num_kv_heads` == 1. + embed_dim (int): embedding dimension for self-attention + max_seq_len (int): maximum sequence length the model will be run with, as used + by :func:`~torchtune.modules.KVCache` + attn_dropout (float): dropout value passed onto scaled_dot_product_attention. + Default: 0.0 + intermediate_dim (Optional[int]): intermediate dimension for MLP. If not specified, + this is computed using :func:`~torchtune.modules.scale_hidden_dim_for_mlp` + norm_eps (float): epsilon in RMS norms. + rope_base (float): the base period of the RoPE embeddings. + tie_word_embeddings (bool): whether the model's input and output word embeddings should be tied. + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): LoRA dropout probability. Default: 0.0 + quantize_base: (bool): Whether to quantize base model weights or not. Only applied to base + weights within linear layers LoRA is applied to. The final output linear projection is not + supported for quantization currently. + + Returns: + TransformerDecoder: Instantiation of Falcon3 model with LoRA applied to + a subset of the attention projections in each layer. + + Raises: + ValueError: if ``apply_lora_to_output`` and ``tie_word_embeddings``. 
+ + """ + + self_attn = lora_falcon3_self_attention( + lora_modules=lora_attn_modules, + embed_dim=embed_dim, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + max_seq_len=max_seq_len, + attn_dropout=attn_dropout, + rope_base=rope_base, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + if apply_lora_to_mlp: + mlp = lora_falcon3_mlp( + dim=embed_dim, + hidden_dim=intermediate_dim, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + quantize_base=quantize_base, + use_dora=use_dora, + lora_dropout=lora_dropout, + ) + else: + mlp = falcon3_mlp(dim=embed_dim, hidden_dim=intermediate_dim) + + layer = TransformerSelfAttentionLayer( + attn=self_attn, + mlp=mlp, + sa_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + mlp_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + ) + + tok_embeddings = nn.Embedding(vocab_size, embed_dim) + + if tie_word_embeddings: + if apply_lora_to_output: + raise ValueError( + "apply_lora_to_output is incompatible with tie_word_embeddings," + " as there would be no output to apply lora to!" + ) + output_proj = TiedLinear(tok_embeddings) + else: + # TODO: quantize_base is not applied to final output_proj currently. + adapter_cls = DoRALinear if use_dora else LoRALinear + output_proj = ( + adapter_cls(embed_dim, vocab_size, rank=lora_rank, alpha=lora_alpha, dropout=lora_dropout) + if apply_lora_to_output + else nn.Linear(embed_dim, vocab_size, bias=False) + ) + model = TransformerDecoder( + tok_embeddings=tok_embeddings, + layers=layer, + num_layers=num_layers, + max_seq_len=max_seq_len, + num_heads=num_heads, + head_dim=(embed_dim // num_heads), + norm=RMSNorm(embed_dim, eps=norm_eps), + output=output_proj, + ) + + if quantize_base: + # For QLoRA, we reparametrize 4-bit tensors to higher precision, and offload to CPU on the fly + # so as to not increase peak memory + model._register_state_dict_hook( + partial( + reparametrize_as_dtype_state_dict_post_hook, + # TODO this is clowny, figure out a better way to get what precision the rest + # of the model is in + dtype=tok_embeddings.weight.dtype, + offload_to_cpu=True, + ) + ) + + return model + + +def lora_falcon3_self_attention( + lora_modules: List[LORA_ATTN_MODULES], + *, + # MultiHeadAttention args + embed_dim: int, + num_heads: int, + num_kv_heads: int, + max_seq_len: int, + attn_dropout: float = 0.0, + rope_base: float = 1_000_042, + # LoRA args + lora_rank: int, + lora_alpha: float, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> MultiHeadAttention: + """ + Return an instance of :func:`~torchtune.modules.MultiHeadAttention` with LoRA + applied to a subset of its linear layers + + Args: + lora_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to. Options are ``{"q_proj", "k_proj", "v_proj", + "output_proj"}``. + embed_dim (int): embedding dimension for self-attention + num_heads (int): number of query heads. For MHA this is also the + number of heads for key and value + num_kv_heads (int): number of key and value heads. User should ensure + `num_heads` % `num_kv_heads` == 0. For standard MHA set `num_kv_heads` == `num_heads`, + for GQA `num_kv_heads` < `num_heads`, and for MQA set `num_kv_heads` == 1. + max_seq_len (int): maximum sequence length the model will be run with, as used + by :func:`~torchtune.modules.KVCache` + attn_dropout (float): dropout value passed onto scaled_dot_product_attention. 
+ Default: 0.0 + rope_base (float): the base period of the RoPE embeddings. Default: 1_000_000.0 + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): LoRA dropout probability. Default: 0.0 + quantize_base (bool): Whether to quantize base model parameters for linear layers + LoRA is being applied to. Default is ``False``. + + Returns: + MultiHeadAttention: instantiation of self-attention module with LoRA + applied to a subset of Q, K, V, output projections. + + Raises: + ValueError: If lora_modules arg is an empty list + """ + if not lora_modules: + raise ValueError(f"Must pass one or more of {LORA_ATTN_MODULES} as lora_modules") + + head_dim = embed_dim // num_heads + num_kv_heads = num_kv_heads if num_kv_heads else num_heads + adapter_cls = DoRALinear if use_dora else LoRALinear + q_proj = ( + adapter_cls( + embed_dim, + num_heads * head_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + use_bias=False, + quantize_base=quantize_base, + ) + if "q_proj" in lora_modules + else nn.Linear(embed_dim, num_heads * head_dim, bias=False) + ) + k_proj = ( + adapter_cls( + embed_dim, + num_kv_heads * head_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + use_bias=False, + quantize_base=quantize_base, + ) + if "k_proj" in lora_modules + else nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False) + ) + v_proj = ( + adapter_cls( + embed_dim, + num_kv_heads * head_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + use_bias=False, + quantize_base=quantize_base, + ) + if "v_proj" in lora_modules + else nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False) + ) + output_proj = ( + adapter_cls( + embed_dim, + embed_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + if "output_proj" in lora_modules + else nn.Linear(embed_dim, embed_dim, bias=False) + ) + rope = Falcon3RotaryPositionalEmbeddings(dim=head_dim, max_seq_len=max_seq_len, base=rope_base) + self_attn = MultiHeadAttention( + embed_dim=embed_dim, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_dim=head_dim, + q_proj=q_proj, + k_proj=k_proj, + v_proj=v_proj, + output_proj=output_proj, + pos_embeddings=rope, + kv_cache=None, + max_seq_len=max_seq_len, + attn_dropout=attn_dropout, + ) + return self_attn + + +def lora_falcon3_mlp( + *, + dim: int, + hidden_dim: int, + lora_rank: int, + lora_alpha: float, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> FeedForward: + adapter_cls = DoRALinear if use_dora else LoRALinear + gate_proj = adapter_cls( + in_dim=dim, + out_dim=hidden_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + down_proj = adapter_cls( + in_dim=hidden_dim, + out_dim=dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + up_proj = adapter_cls( + in_dim=dim, + out_dim=hidden_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + return FeedForward( + gate_proj=gate_proj, + down_proj=down_proj, + up_proj=up_proj, + ) diff --git a/torchtune/models/falcon3/_convert_weights.py b/torchtune/models/falcon3/_convert_weights.py new file mode 100644 index 0000000000..d6d6122f5c --- /dev/null +++ b/torchtune/models/falcon3/_convert_weights.py @@ -0,0 +1,113 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict + +import torch + +from torchtune.models.convert_weights import get_mapped_key + +# state dict key mappings from HF's format to torchtune's format +_FROM_HF = { + "model.embed_tokens.weight": "tok_embeddings.weight", + "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attn.q_proj.weight", + "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attn.k_proj.weight", + "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attn.v_proj.weight", + "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attn.output_proj.weight", + "model.layers.{}.mlp.gate_proj.weight": "layers.{}.mlp.w1.weight", + "model.layers.{}.mlp.up_proj.weight": "layers.{}.mlp.w3.weight", + "model.layers.{}.mlp.down_proj.weight": "layers.{}.mlp.w2.weight", + "model.layers.{}.input_layernorm.weight": "layers.{}.sa_norm.scale", + "model.layers.{}.post_attention_layernorm.weight": "layers.{}.mlp_norm.scale", + "model.norm.weight": "norm.scale", + "lm_head.weight": "output.weight", +} + + +FALCON3_TIED_KEY = "lm_head.weight" + + +def falcon3_hf_to_tune( + state_dict: Dict[str, torch.Tensor], + num_heads: int = 12, + num_kv_heads: int = 4, + dim: int = 3072, + head_dim: int = None, + tie_word_embeddings: bool = False, +) -> Dict[str, torch.Tensor]: + """ + Convert a state dict from HF's format to TorchTune's format, which contains the weights + of a Falcon3 model. + State dicts from multiple checkpoint files should be consolidated into a single state dict + before calling this function. + The logic is identical to :func:`~torchtune.models.convert_weights.hf_to_tune`, but may not load + output projection weights. + + Args: + state_dict (Dict[str, torch.Tensor]): State dict in HF's format. + num_heads (int): Number of heads in the model. + num_kv_heads (int): Number of heads in the key/value projection layers. + dim (int): Dimension of the model. + head_dim (int): Dimension of the head. If not provided, it will be calculated + as dim // num_heads. + tie_word_embeddings (bool): Whether the model's input and output word embeddings should be tied. + + Returns: + Dict[str, torch.Tensor]: State dict in torchtune's format. + """ + converted_state_dict = {} + if head_dim is None: + head_dim = dim // num_heads + + for key, value in state_dict.items(): + if ( + tie_word_embeddings and FALCON3_TIED_KEY in key + ): # Skip loading the output projection weights + continue + if "rotary_emb.inv_freq" in key: # Skip loading the position embeddings + continue + + new_key = get_mapped_key(key, _FROM_HF) + converted_state_dict[new_key] = value + return converted_state_dict + + +def falcon3_tune_to_hf( + state_dict: Dict[str, torch.Tensor], + num_heads: int = 12, + num_kv_heads: int = 4, + dim: int = 3072, + head_dim: int = None, + tie_word_embeddings: bool = False, +): + """ + Convert a state dict from torchtune's format to HF's format. This function + doesn't handle any sharding or splitting of state dicts. It follows the + state_dict IN -> state_dict OUT pattern. + + Args: + state_dict (Dict[str, torch.Tensor]): State dict in torchtune's format. + num_heads (int): Number of heads in the model. + num_kv_heads (int): Number of heads in the key/value projection layers. + dim (int): Dimension of the model. + head_dim (int): Dimension of the head. If not provided, it will be calculated + as dim // num_heads. 
+ tie_word_embeddings (bool): Whether the model's input and output word embeddings should be tied. + + Returns: + Dict[str, torch.Tensor]: State dict in HF's format. + """ + converted_state_dict = {} + inverted_mapping_dict = {v: k for k, v in _FROM_HF.items()} + + if head_dim is None: + head_dim = dim // num_heads + + for key, value in state_dict.items(): + new_key = get_mapped_key(key, inverted_mapping_dict) + converted_state_dict[new_key] = value + + return converted_state_dict diff --git a/torchtune/models/falcon3/_model_builders.py b/torchtune/models/falcon3/_model_builders.py new file mode 100644 index 0000000000..acf224920a --- /dev/null +++ b/torchtune/models/falcon3/_model_builders.py @@ -0,0 +1,375 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import List, Optional + +from torchtune.data._prompt_templates import _get_prompt_template, _TemplateType + +from torchtune.models.falcon3._component_builders import lora_falcon3, falcon3 +from torchtune.models.falcon3._tokenizer import FALCON3_SPECIAL_TOKENS, Falcon3Tokenizer +from torchtune.modules import TransformerDecoder +from torchtune.modules.peft import LORA_ATTN_MODULES +from torchtune.modules.tokenizers import parse_hf_tokenizer_json + +""" +Model builders build specific instantiations using component builders. For example +the falcon3 model builder uses the falcon3 component builder to create the +falcon3 model series. +""" +def falcon3_tokenizer( + path: str, + merges_file: str = None, + special_tokens_path: Optional[str] = None, + max_seq_len: Optional[int] = None, + prompt_template: Optional[_TemplateType] = None, + **kwargs, +) -> Falcon3Tokenizer: + """ + Tokenizer for Falcon3. + + Args: + path (str): path to the vocab.json file. + special_tokens_path (Optional[str]): Path to ``tokenizer.json`` from Hugging Face + model files that contains all registered special tokens, or a local json file + structured similarly. Default is None to use the canonical Falcon3 special tokens. + max_seq_len (Optional[int]): A max sequence length to truncate tokens to. + Default: None + prompt_template (Optional[_TemplateType]): optional specified prompt template. + If a string, it is assumed to be the dotpath of a :class:`~torchtune.data.PromptTemplateInterface` + class. If a dictionary, it is assumed to be a custom prompt template mapping role to the + prepend/append tags. Default is None. 
+ + Returns: + Falcon3Tokenizer: Instantiation of the Falcon3 tokenizer + """ + special_tokens = ( + parse_hf_tokenizer_json(special_tokens_path) + if special_tokens_path is not None + else FALCON3_SPECIAL_TOKENS + ) + template = ( + _get_prompt_template(prompt_template) if prompt_template is not None else None + ) + return Falcon3Tokenizer( + path=path, + special_tokens=special_tokens, + max_seq_len=max_seq_len, + prompt_template=template, + **kwargs, + ) + + +''' +Models +''' +def falcon3_10b() -> TransformerDecoder: + """ + Builder for creating a Falcon3 model initialized w/ the default 10B parameter values + from https://huggingface.co/tiiuae/Falcon3-10B-Base + + Returns: + TransformerDecoder: Instantiation of Falcon3 10B model + """ + return falcon3( + vocab_size=131072, + num_layers=40, + num_heads=12, + num_kv_heads=4, + embed_dim=3072, + intermediate_dim=23040, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-06, + rope_base=1_000_042, + ) + + +def falcon3_7b() -> TransformerDecoder: + """ + Builder for creating a Falcon3 model initialized w/ the default 7B parameter values + from https://huggingface.co/tiiuae/Falcon3-7B-Base + + Returns: + TransformerDecoder: Instantiation of Falcon3 7B model + """ + return falcon3( + vocab_size=131072, + num_layers=28, + num_heads=12, + num_kv_heads=4, + embed_dim=3072, + intermediate_dim=23040, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-06, + rope_base=1_000_042, + ) + + + +def falcon3_3b() -> TransformerDecoder: + """ + Builder for creating a Falcon3 model initialized w/ the default 3B parameter values + from https://huggingface.co/tiiuae/Falcon3-3B-Base + + Returns: + TransformerDecoder: Instantiation of Falcon3 3B model + """ + return falcon3( + vocab_size=131072, + num_layers=22, + num_heads=12, + num_kv_heads=4, + embed_dim=3072, + intermediate_dim=9216, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-06, + rope_base=1_000_042, + ) + + +def falcon3_1b() -> TransformerDecoder: + """ + Builder for creating a Falcon3 model initialized w/ the default 1B parameter values + from https://huggingface.co/tiiuae/Falcon3-1B-Base + + Returns: + TransformerDecoder: Instantiation of Falcon3 1B model + """ + return falcon3( + vocab_size=131072, + num_layers=18, + num_heads=8, + num_kv_heads=4, + embed_dim=2048, + intermediate_dim=8192, + max_seq_len=4096, + attn_dropout=0.0, + norm_eps=1e-06, + rope_base=1_000_042, + ) +''' +LoRA +''' + +def lora_falcon3_10b( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Falcon3 10B model with LoRA enabled. + + The Falcon3 defaults are the same as in :func:`~torchtune.models.falcon3.falcon3_10b`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. 
+ Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Falcon3 7B model with LoRA applied + """ + return lora_falcon3( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=131072, + num_layers=40, + num_heads=12, + num_kv_heads=4, + embed_dim=3072, + intermediate_dim=23040, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-06, + rope_base=1_000_042, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_falcon3_7b( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Falcon3 7B model with LoRA enabled. + + The Falcon3 defaults are the same as in :func:`~torchtune.models.falcon3.falcon3_7b`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Falcon3 7B model with LoRA applied + """ + return lora_falcon3( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=131072, + num_layers=28, + num_heads=12, + num_kv_heads=4, + embed_dim=3072, + intermediate_dim=23040, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1_000_042, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_falcon3_3b( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Falcon3 7B model with LoRA enabled. + + The Falcon3 defaults are the same as in :func:`~torchtune.models.falcon3.falcon3_7b`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. 
Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Falcon3 7B model with LoRA applied + """ + return lora_falcon3( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=131072, + num_layers=22, + num_heads=12, + num_kv_heads=4, + embed_dim=3072, + intermediate_dim=9216, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-06, + rope_base=1_000_042, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_falcon3_1b( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Falcon3 7B model with LoRA enabled. + + The Falcon3 defaults are the same as in :func:`~torchtune.models.falcon3.falcon3_7b`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Falcon3 7B model with LoRA applied + """ + return lora_falcon3( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=131072, + num_layers=18, + num_heads=8, + num_kv_heads=4, + embed_dim=2048, + intermediate_dim=8192, + max_seq_len=4096, + attn_dropout=0.0, + norm_eps=1e-06, + rope_base=1_000_042, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) diff --git a/torchtune/models/falcon3/_positional_embeddings.py b/torchtune/models/falcon3/_positional_embeddings.py new file mode 100644 index 0000000000..12b9303571 --- /dev/null +++ b/torchtune/models/falcon3/_positional_embeddings.py @@ -0,0 +1,119 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Optional + +import torch + +from torch import nn + + +class Falcon3RotaryPositionalEmbeddings(nn.Module): + """ + RoPE Embeddings used in the Falcon3 model. + Ref: https://huggingface.co/Qwen/Qwen2-7B-Instruct + + This class is not numerically equivalent to the RoPE Embedding module + used by Llama2 and Llama3. + + Args: + dim (int): Embedding dimension. This is usually set to the dim of each + head in the attention module computed as ``embed_dim`` // ``num_heads`` + max_seq_len (int): Maximum expected sequence length for the + model, if exceeded the cached freqs will be recomputed + base (float): The base for the geometric progression used to compute + the rotation angles + """ + + def __init__( + self, + dim: int, + max_seq_len: int = 8192, + base: float = 1_000_042, + ) -> None: + super().__init__() + self.dim = dim + self.base = base + self.max_seq_len = max_seq_len + self.rope_init() + + def rope_init(self): + theta = 1.0 / ( + self.base + ** (torch.arange(0, self.dim, 2)[: (self.dim // 2)].float() / self.dim) + ) + self.register_buffer("theta", theta, persistent=False) + self.build_rope_cache(self.max_seq_len) + + def build_rope_cache(self, max_seq_len: int = 8192) -> None: + # Create position indexes `[0, 1, ..., max_seq_len - 1]` + seq_idx = torch.arange( + max_seq_len, dtype=self.theta.dtype, device=self.theta.device + ) + + # Outer product of theta and position index; output tensor has + # a shape of [max_seq_len, dim // 2] + idx_theta = torch.einsum("i, j -> ij", seq_idx, self.theta).float() + + # We cache the cos and sin embeddings instead of the IDs. This helps + # ensure we have correct behavior when training with bf16 + # Size: [max_seq_len, (dim * 2)] + freqs = torch.cat([idx_theta, idx_theta], dim=-1) + cache = torch.cat([freqs.cos(), freqs.sin()], dim=-1) + self.register_buffer("cache", cache, persistent=False) + + def forward( + self, x: torch.Tensor, input_pos: Optional[torch.Tensor] = None + ) -> torch.Tensor: + """ + Args: + x (torch.Tensor): input tensor with shape + [b, s, n_h, h_d] + input_pos (Optional[torch.Tensor]): Optional tensor which contains the position ids + of each token. During training, this is used to indicate the positions + of each token relative to its sample when packed, shape [b, s]. + During inference, this indicates the position of the current token. + If none, assume the index of the token is its position id. Default is None. + + Returns: + Tensor: output tensor with RoPE applied + + Notation used for tensor shapes: + - b: batch size + - s: sequence length + - n_h: num heads + - h_d: head dim + + TODO: The implementation below can be made more efficient + for inference. + """ + # input tensor has shape [b, s, n_h, h_d] + seq_len = x.size(1) + head_dim = x.size(-1) + + # extract the values based on whether input_pos is set or not. 
When + # input_pos is provided, we're in inference mode + rope_cache = ( + self.cache[:seq_len] if input_pos is None else self.cache[input_pos] + ) + + # reshape the cache for broadcasting + # tensor has shape [b, s, 1, h_d * 2] if packed samples, + # otherwise has shape [1, s, 1, h_d * 2] + rope_cache = rope_cache.view(-1, seq_len, 1, head_dim * 2) + + # [b, s, 1, h_d] + cos = rope_cache[..., :head_dim].to(x.dtype) + sin = rope_cache[..., head_dim:].to(x.dtype) + + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + rotated = torch.cat((-x2, x1), dim=-1) + + # cos: [b, s, 1, h_d] + # x: [b, s, n_h, h_d] + x_out = (x * cos) + (rotated * sin) + return x_out.type_as(x) diff --git a/torchtune/models/falcon3/_tokenizer.py b/torchtune/models/falcon3/_tokenizer.py new file mode 100644 index 0000000000..a5fc871cd0 --- /dev/null +++ b/torchtune/models/falcon3/_tokenizer.py @@ -0,0 +1,436 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import json +import unicodedata +from functools import lru_cache +from typing import Any, Dict, List, Mapping, Optional, Tuple + +import regex as re + +from torchtune.data import ChatMLTemplate, Message, PromptTemplate, truncate +from torchtune.modules.tokenizers import ModelTokenizer + +PRETOKENIZE_REGEX = ( + r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" +) +FALCON3_SPECIAL_TOKENS = { + ">>TITLE<<": 0, + ">>ABSTRACT<<": 1, + ">>INTRODUCTION<<": 2, + ">>SUMMARY<<": 3, + ">>COMMENT<<": 4, + ">>ANSWER<<": 5, + ">>QUESTION<<": 6, + ">>DOMAIN<<": 7, + ">>EMAIL_ADDRESS<<": 8, + ">>IP_ADDRESS<<": 9, + "<|startoftext|>": 10, + "<|endoftext|>": 11, + ">>IP_ADDRESS_0<<": 14, + ">>IP_ADDRESS_1<<": 15, + ">>IP_ADDRESS_2<<": 16, + ">>IP_ADDRESS_3<<": 17, + ">>IP_ADDRESS_4<<": 18, + ">>IP_ADDRESS_5<<": 19, + ">>IP_ADDRESS_6<<": 20, + ">>IP_ADDRESS_7<<": 21, + ">>IP_ADDRESS_8<<": 22, + ">>IP_ADDRESS_9<<": 23, + ">>PASSWORD<<": 24, + ">>KEY<<": 25 +} + +ENDOFTEXT = "<|endoftext|>" + +DEFAULT_FALCON3_TOKENIZER_BPE_CACHE_SIZE = 152064 + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoid mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2 ** 8): + if b not in bs: + bs.append(b) + cs.append(2 ** 8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class Falcon3Tokenizer(ModelTokenizer): + """This class construct a Falcon3 tokenizer, based on GPT-2 byte-level BPE tokenization. + + See . 
+ + Args: + path (str): Path to tokenizer.json file. + special_tokens (Dict[str, int]): Special tokens to add to the tokenizer. Default is FALCON3_SPECIAL_TOKENS. + max_seq_len (Optional[int]): A max sequence length to truncate tokens to. + Default: None + prompt_template (Optional[PromptTemplate]): template used to format the messages based on their role. This is used + to add structured text around the actual messages. The structured text is used in three scenarios: + + - Task-specific templates to gear models for a particular task that it will expect after training + - Model-specific templates that are required whenever the model is prompted, such as the [INST] + tags in Llama2 and in Mistral + - Community standardized templates, such as :class:`~torchtune.data.ChatMLTemplate` + + The extra text will still get tokenized as normal text, not as special tokens. + Default: None + errors (str): Paradigm to follow when decoding bytes to UTF-8. Defaults to "replace". + See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + unk_token (Optional[str]): The unknown token. A token that is not in the vocabulary cannot be converted + to an ID and is set to be this token instead. Defaults to None. + bos_token (Optional[str]): The beginning of sequence token. Defaults to None. + eos_token (str): The end of sequence token. Defaults to ``<|endoftext|>``. + pad_token (Optional[str]): The token used for padding. Defaults to ``<|endoftext|>``. + bpe_cache_size (int): BPE token cache size in Falcon3Tokenizer. + NOTE: large cache size will speed up tokenization, but the cache object will get really + large for long running processes (esp. for texts of language that do not use space between + word, e.g. Chinese); technically not a memory leak but appears as one. + By default, we set the cache size equal to the size of the official Falcon3 tokenizer. 
+ + Example: + >>> tokenizer = Falcon3Tokenizer(path="/path/to/tokenizer.json", special_tokens=FALCON3_SPECIAL_TOKENS) + >>> tokenized_text = tokenizer.encode("Hello world!") + >>> print(tokenized_text) + [13955, 3139, 2024] + """ + + def __init__( + self, + path: str, + special_tokens: Dict[str, int] = FALCON3_SPECIAL_TOKENS, + max_seq_len: Optional[int] = None, + *, + prompt_template: Optional[PromptTemplate] = None, + errors: str = "replace", + unk_token: Optional[str] = None, + bos_token: Optional[str] = None, + eos_token: str = ENDOFTEXT, + pad_token: Optional[str] = ENDOFTEXT, + bpe_cache_size: int = DEFAULT_FALCON3_TOKENIZER_BPE_CACHE_SIZE, + ): + with open(path, encoding="utf-8") as tokenizer_file: + tok_dict = json.load(tokenizer_file) + + self.encoder = tok_dict['model']['vocab'] + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + bpe_merges = [] + merges_handle = tok_dict['model']['merges'] + for i, line in enumerate(merges_handle): + # line = line.strip() + bpe_merges.append(tuple(line)) + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + + self._bpe = lru_cache(maxsize=bpe_cache_size)(self._bpe_without_cache) + + self.pat = re.compile(PRETOKENIZE_REGEX) + + self.special_tokens = special_tokens + self._special_tokens_reversed = {v: k for k, v in self.special_tokens.items()} + + self.unk_id = None if unk_token is None else self.special_tokens[unk_token] + self.bos_id = None if bos_token is None else self.special_tokens[bos_token] + self.eos_id = None if eos_token is None else self.special_tokens[eos_token] + self.pad_id = None if pad_token is None else self.special_tokens[pad_token] + + # Pattern for special tokens. + self._pattern_split_special_tokens = re.compile(r"(" + "|".join(special_tokens.keys()) + r")") + + self.max_seq_len = max_seq_len + + self.prompt_template = prompt_template + + def _bpe_without_cache(self, token): + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + return word + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self._bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.unk_id) + + def encode( + self, text: str, add_bos: bool = True, add_eos: bool = True + ) -> List[int]: + """ + Encode a string into a list of token ids. + + Args: + text (str): The string to encode. 
+ add_bos (bool): (Optional) Whether to add the beginning of sequence token. + add_eos (bool): (Optional) Whether to add the end of sequence token. + + Returns: + List[int]: The list of token ids. + """ + + text = unicodedata.normalize("NFC", text) + + tokens = self._pattern_split_special_tokens.split(text) + + tokenized_text = [] + for token in tokens: + if not token: + continue + if token in self.special_tokens: + tokenized_text.append(token) + else: + tokenized_text.extend(self._tokenize(token)) + + # Convert tokenized text to token ids. + token_ids = [] + if add_bos and self.bos_id is not None: + token_ids.append(self.bos_id) + for token in tokenized_text: + if token in self.special_tokens: + token_id = self.special_tokens[token] + else: + token_id = self._convert_token_to_id(token) + token_ids.append(token_id) + if add_eos and self.eos_id is not None: + token_ids.append(self.eos_id) + + return token_ids + + def _convert_id_to_token(self, index: int) -> str: + """Converts an index (integer) to a token (str) using the vocab.""" + token = self._special_tokens_reversed.get(index, None) + if token is None: + return self.decoder.get(index) + return token + + def _convert_tokens_to_string(self, tokens: List[str]) -> str: + """Converts a sequence of tokens (strings) into a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode( + "utf-8", errors=self.errors + ) + return text + + def decode( + self, + token_ids: List[int], + skip_special_tokens: bool = False, + ) -> str: + """ + Decode a list of token ids into a string. + + Args: + token_ids (List[int]): The list of token ids. + skip_special_tokens (bool): Whether the special tokens should be removed from the decoded string. + + Returns: + str: The decoded string. + """ + sub_texts = [] + current_sub_text = [] + for token_id in token_ids: + token = self._convert_id_to_token(token_id) + if token_id in self._special_tokens_reversed: + if current_sub_text: + string = self._convert_tokens_to_string(current_sub_text) + if string: + sub_texts.append(string) + current_sub_text = [] + if not skip_special_tokens: + sub_texts.append(token) + else: + current_sub_text.append(token) + if current_sub_text: + sub_texts.append(self._convert_tokens_to_string(current_sub_text)) + + text = "".join(sub_texts) + return text + + def tokenize_messages( + self, + messages: List[Message], + *, + add_eos: bool = True, + ) -> Tuple[List[int], List[bool]]: + """ + Given a list of messages, return a list of tokens for the concatenated + and formatted messages. + + Args: + messages (List[Message]): The message list to tokenize. + add_eos (bool): Whether to add the tokenizer's eos_id at the end of the + sequence of messages. Default is True. + + Returns: + Tuple[List[int], List[bool]]: The list of token ids and the list of masks. + + Raises: + RuntimeError: If a message contains non-text content + """ + assert not isinstance(self.prompt_template, ChatMLTemplate), ( + "Using ChatMLTemplate with tokenize_messages will result in multiple <|im_*|> tokens wrapping each message." + "Please use a different template or set to None." + ) + templated_messages = ( + self.prompt_template(messages) + if self.prompt_template is not None + else messages + ) + + tokenized_messages = [] + mask = [] + for index, message in enumerate(templated_messages): + tokens = [] + + # message header + if message.role != "ipython": + # tokens.append(self.im_start_id) + tokens.extend( + self.encode(f"{message.role}\n", add_bos=False, add_eos=False) + ) + + # message content + for item in message.content: + if item["type"] == "text": + tokens.extend( + self.encode( + item["content"], + add_bos=False, + add_eos=False, + ) + ) + else: + raise RuntimeError( + f"Unsupported message content type: {item['type']}" + ) + + # message footer + if message.role != "ipython" and ( + message.role != "assistant" or index != len(messages) - 1 + ): + # tokens.append(self.im_end_id) + tokens.extend(self.encode("\n", add_bos=False, add_eos=False)) + + tokenized_messages.extend(tokens) + mask.extend([message.masked] * len(tokens)) + + # Break out early if we reach max_seq_len + if self.max_seq_len and len(tokenized_messages) >= self.max_seq_len: + break + + # Add the End-Of-Sequence token + if add_eos: + tokenized_messages.append(self.eos_id) + mask.append(mask[-1]) + + # Finally, truncate if necessary + if self.max_seq_len: + tokenized_messages = truncate( + tokenized_messages, self.max_seq_len, self.eos_id if add_eos else None + ) + mask = truncate(mask, self.max_seq_len, True if add_eos else None) + + return tokenized_messages, mask + + def __call__( + self, sample: Mapping[str, Any], inference: bool = False + ) -> Mapping[str, Any]: + """ + Apply ``tokenize_messages`` to the "messages" field in the sample. + + Args: + sample (Mapping[str, Any]): A sample with a "messages" field containing + a List[Message] to tokenize + inference (bool): Whether the template is being used for inference or not. + + Returns: + Mapping[str, Any]: The sample with added "tokens" and "mask" fields + and the "messages" field removed. 
+ """ + messages = sample.pop("messages") + tokens, mask = self.tokenize_messages(messages) + sample["tokens"] = tokens + sample["mask"] = mask + return sample diff --git a/torchtune/training/checkpointing/_checkpointer.py b/torchtune/training/checkpointing/_checkpointer.py index a5d72af320..df93535dca 100644 --- a/torchtune/training/checkpointing/_checkpointer.py +++ b/torchtune/training/checkpointing/_checkpointer.py @@ -553,6 +553,16 @@ def load_checkpoint(self) -> Dict[str, Any]: num_kv_heads=self._config["num_key_value_heads"], dim=self._config["hidden_size"], ) + elif self._model_type == ModelType.FALCON3: + from torchtune.models.falcon3._convert_weights import falcon3_hf_to_tune + + converted_state_dict[training.MODEL_KEY] = falcon3_hf_to_tune( + merged_state_dict, + num_heads=self._config["num_attention_heads"], + num_kv_heads=self._config["num_key_value_heads"], + dim=self._config["hidden_size"], + tie_word_embeddings=self._config["tie_word_embeddings"], + ) elif self._model_type == ModelType.QWEN2: from torchtune.models.qwen2._convert_weights import qwen2_hf_to_tune @@ -662,6 +672,16 @@ def save_checkpoint( num_kv_heads=self._config["num_key_value_heads"], dim=self._config["hidden_size"], ) + elif self._model_type == ModelType.FALCON3: + from torchtune.models.falcon3._convert_weights import falcon3_tune_to_hf + + state_dict[training.MODEL_KEY] = falcon3_tune_to_hf( + state_dict[training.MODEL_KEY], + num_heads=self._config["num_attention_heads"], + num_kv_heads=self._config["num_key_value_heads"], + dim=self._config["hidden_size"], + tie_word_embeddings=self._config["tie_word_embeddings"], + ) elif self._model_type == ModelType.QWEN2: from torchtune.models.qwen2._convert_weights import qwen2_tune_to_hf diff --git a/torchtune/training/checkpointing/_utils.py b/torchtune/training/checkpointing/_utils.py index f8dc55452b..d38156599d 100644 --- a/torchtune/training/checkpointing/_utils.py +++ b/torchtune/training/checkpointing/_utils.py @@ -103,6 +103,7 @@ class ModelType(Enum): >>> state_dict = my_custom_state_dict_mapping(state_dict) """ + FALCON3: str = "falcon3" GEMMA: str = "gemma" GEMMA2: str = "gemma2" LLAMA2: str = "llama2"