diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index aaef714ef738..4bafdd97ba21 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import json import os from copy import deepcopy from pathlib import Path diff --git a/nemo/collections/llm/recipes/baichuan2_7b.py b/nemo/collections/llm/recipes/baichuan2_7b.py index 823f6e07cd57..1350cbaa7edd 100644 --- a/nemo/collections/llm/recipes/baichuan2_7b.py +++ b/nemo/collections/llm/recipes/baichuan2_7b.py @@ -25,7 +25,7 @@ from nemo.collections.llm import Baichuan2Config7B, Baichuan2Model from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -254,8 +254,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -279,8 +281,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/chatglm3_6b.py b/nemo/collections/llm/recipes/chatglm3_6b.py index b6c640372074..2cd424ce5bf6 100644 --- a/nemo/collections/llm/recipes/chatglm3_6b.py +++ b/nemo/collections/llm/recipes/chatglm3_6b.py @@ -25,7 +25,7 @@ from nemo.collections.llm import ChatGLM3Config6B, ChatGLMModel from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -254,8 +254,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. 
- peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -279,8 +281,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/finetune_default.py b/nemo/collections/llm/recipes/finetune_default.py index f05fd7cb2d13..e8af7f67bdbd 100644 --- a/nemo/collections/llm/recipes/finetune_default.py +++ b/nemo/collections/llm/recipes/finetune_default.py @@ -21,9 +21,11 @@ import nemo.lightning as nl from nemo.collections import llm from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs +from nemo.collections.llm.peft import DoRA, LoRA from nemo.collections.llm.recipes.log.default import tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks import PEFT def default_finetune_recipe( @@ -158,3 +160,41 @@ def nemo_resume(model_id: str) -> run.Config[nl.AutoResume]: nl.AutoResume, restore_config=run.Config(nl.RestoreConfig, path=f"nemo://{model_id}"), ) + + +@run.cli.factory(name='lora') +def lora() -> run.Config[PEFT]: + """ + Factory function to create a LoRA configuration. + + Returns: + run.Config[PEFT]: Configuration for the LoRA class. + + Examples: + CLI usage: + $ nemo llm finetune -f llama3_8b peft=lora + + Python API usage: + >>> lora_config = lora() + >>> print(lora_config) + """ + return run.Config(LoRA) + + +@run.cli.factory(name='dora') +def dora() -> run.Config[PEFT]: + """ + Factory function to create a DoRA configuration. + + Returns: + run.Config[PEFT]: Configuration for the DoRA class. 
+ + Examples: + CLI usage: + $ nemo llm finetune -f llama3_8b peft=dora + + Python API usage: + >>> dora_config = dora() + >>> print(dora_config) + """ + return run.Config(DoRA) diff --git a/nemo/collections/llm/recipes/gemma2_27b.py b/nemo/collections/llm/recipes/gemma2_27b.py index 2025bd570503..d6b41c0a221c 100644 --- a/nemo/collections/llm/recipes/gemma2_27b.py +++ b/nemo/collections/llm/recipes/gemma2_27b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.gemma2 import gemma2_model, gemma2_trainer from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger @@ -191,8 +191,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -220,8 +222,8 @@ def finetune_recipe( recipe.optim.config.lr = 5e-6 recipe.trainer.strategy.tensor_model_parallel_size = 8 recipe.trainer.strategy.pipeline_model_parallel_size = 2 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.trainer.strategy.tensor_model_parallel_size = 4 recipe.optim.config.lr = 1e-4 else: diff --git a/nemo/collections/llm/recipes/gemma2_2b.py b/nemo/collections/llm/recipes/gemma2_2b.py index e1aa3ad4be86..138140d0515d 100644 --- a/nemo/collections/llm/recipes/gemma2_2b.py +++ b/nemo/collections/llm/recipes/gemma2_2b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.gemma2 import gemma2_model, gemma2_trainer from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger @@ -191,8 +191,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. 
+ packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -218,8 +220,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/gemma2_9b.py b/nemo/collections/llm/recipes/gemma2_9b.py index 8117102f1b75..c49ac0246307 100644 --- a/nemo/collections/llm/recipes/gemma2_9b.py +++ b/nemo/collections/llm/recipes/gemma2_9b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.gemma2 import gemma2_model, gemma2_trainer from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger @@ -191,8 +191,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -219,8 +221,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.optim.config.lr = 5e-6 recipe.trainer.strategy.tensor_model_parallel_size = 4 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/gemma_2b.py b/nemo/collections/llm/recipes/gemma_2b.py index 8798af436a9c..8bdf89696d56 100644 --- a/nemo/collections/llm/recipes/gemma_2b.py +++ b/nemo/collections/llm/recipes/gemma_2b.py @@ -24,7 +24,7 @@ from nemo.collections.llm import GemmaConfig2B, GemmaModel from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -253,8 +253,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. 
num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given maximum seq_length for better efficiency. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -284,8 +286,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.context_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/gemma_7b.py b/nemo/collections/llm/recipes/gemma_7b.py index 0bfd62b33e9e..46c91e27575a 100644 --- a/nemo/collections/llm/recipes/gemma_7b.py +++ b/nemo/collections/llm/recipes/gemma_7b.py @@ -24,7 +24,7 @@ from nemo.collections.llm import GemmaConfig7B, GemmaModel from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -256,8 +256,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. 
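Reviewer note on the `lora`/`dora` factories introduced in `finetune_default.py` above: because they are registered through `run.cli.factory`, a scheme can be selected from the CLI or composed in Python. A minimal sketch, assuming the recipe builds offline with its default arguments:

```python
from nemo.collections.llm.recipes import llama3_8b
from nemo.collections.llm.recipes.finetune_default import dora

# Python API: the recipe wires everything up from the scheme name.
recipe = llama3_8b.finetune_recipe(name="llama3_8b_dora", peft_scheme="dora")

# The factory produces the same config object the CLI resolves:
#   $ nemo llm finetune -f llama3_8b peft=dora
peft_cfg = dora()        # run.Config(DoRA) under the hood
peft_cfg.dim = 8         # LoRA and DoRA share one hyperparameter surface
peft_cfg.alpha = 16
```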
@@ -287,8 +289,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/llama31_405b.py b/nemo/collections/llm/recipes/llama31_405b.py index d71f3791a0af..5f08d82bd888 100644 --- a/nemo/collections/llm/recipes/llama31_405b.py +++ b/nemo/collections/llm/recipes/llama31_405b.py @@ -26,7 +26,7 @@ from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.llm.gpt.model.llama import Llama31Config405B, LlamaModel -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -266,7 +266,8 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for finetuning. Allowed values: 'lora'/'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. seq_length (int): Maximum number of tokens per microbatch. packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given maximum seq_length for better efficiency. By default, this value equals performance_mode. @@ -296,7 +297,7 @@ def finetune_recipe( if num_nodes is None: if peft_scheme is None or peft_scheme.lower() == 'none': num_nodes = 12 - elif peft_scheme.lower() == 'lora': + elif peft_scheme.lower() in ['lora', 'dora']: num_nodes = 3 recipe = default_finetune_recipe( @@ -307,8 +308,8 @@ def finetune_recipe( recipe.trainer.strategy.pipeline_model_parallel_size = 14 recipe.data.global_batch_size = 6 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.peft.dim = 16 recipe.peft.alpha = 32 recipe.optim.config.use_distributed_optimizer = False @@ -348,7 +349,8 @@ def finetune_performance_optimizations( Args: recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added - peft_scheme (str): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. Returns: run.Partial: Partial configuration for performance-optimized fine-tuning. 
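All of these recipes now resolve the scheme name through `PEFT_STR2CLS` rather than importing `LoRA` directly. The registry itself sits outside this diff; a plausible minimal shape, where the module paths and any entries beyond `lora`/`dora` are assumptions:

```python
# Hypothetical sketch of the registry in nemo/collections/llm/peft/__init__.py;
# only the 'lora' and 'dora' keys are confirmed by the recipes in this diff.
from nemo.collections.llm.peft.dora import DoRA
from nemo.collections.llm.peft.lora import LoRA

PEFT_STR2CLS = {
    "lora": LoRA,
    "dora": DoRA,
}

# Recipes index it with the lowercased user string:
peft_cls = PEFT_STR2CLS["DoRA".lower()]   # -> DoRA
```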
diff --git a/nemo/collections/llm/recipes/llama31_70b.py b/nemo/collections/llm/recipes/llama31_70b.py index 37e809e5bc8f..3120fedd7923 100644 --- a/nemo/collections/llm/recipes/llama31_70b.py +++ b/nemo/collections/llm/recipes/llama31_70b.py @@ -26,7 +26,7 @@ from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.llm.gpt.model.llama import Llama31Config70B, LlamaModel -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -266,7 +266,8 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for finetuning. Allowed values: 'lora'/'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. seq_length (int): Maximum number of tokens per microbatch. packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given maximum seq_length for better efficiency. By default, this value equals performance_mode. @@ -300,7 +301,7 @@ def finetune_recipe( if num_nodes is None: if peft_scheme is None or peft_scheme.lower() == 'none': num_nodes = 4 - elif peft_scheme.lower() == 'lora': + elif peft_scheme.lower() in ['lora', 'dora']: num_nodes = 1 recipe = default_finetune_recipe( @@ -310,8 +311,8 @@ def finetune_recipe( recipe.trainer.strategy.tensor_model_parallel_size = 8 recipe.trainer.strategy.pipeline_model_parallel_size = 4 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.peft.dim = 16 recipe.peft.alpha = 32 recipe.optim.config.use_distributed_optimizer = False @@ -349,7 +350,8 @@ def finetune_performance_optimizations( Args: recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added - peft_scheme (str): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. Returns: run.Partial: Partial configuration for performance-optimized fine-tuning. 
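Each `finetune_recipe` hunk repeats one template. A condensed restatement of that pattern, with the hyperparameter values taken from the Llama 70B hunks above:

```python
import nemo_run as run

from nemo.collections.llm.peft import PEFT_STR2CLS


def apply_peft_scheme(recipe, peft_scheme):
    """Condensed form of the branching every finetune_recipe() now uses."""
    if peft_scheme is None or peft_scheme.lower() == 'none':
        # Full fine-tuning: heavier parallelism, smaller learning rate.
        recipe.trainer.strategy.tensor_model_parallel_size = 8
        recipe.trainer.strategy.pipeline_model_parallel_size = 4
        recipe.optim.config.lr = 5e-6
    elif peft_scheme.lower() in ['lora', 'dora']:
        # Both adapter schemes share one code path; only the class differs.
        recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()])
        recipe.peft.dim = 16
        recipe.peft.alpha = 32
        recipe.optim.config.use_distributed_optimizer = False
        recipe.optim.config.lr = 1e-4
    else:
        raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")
    return recipe
```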
diff --git a/nemo/collections/llm/recipes/llama31_8b.py b/nemo/collections/llm/recipes/llama31_8b.py index 32a77ce076f2..62514940b678 100644 --- a/nemo/collections/llm/recipes/llama31_8b.py +++ b/nemo/collections/llm/recipes/llama31_8b.py @@ -26,7 +26,7 @@ from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.llm.gpt.model.llama import Llama31Config8B, LlamaModel -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -266,7 +266,8 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for finetuning. Allowed values: 'lora'/'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. seq_length (int): Maximum number of tokens per microbatch. packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given maximum seq_length for better efficiency. By default, this value equals performance_mode. @@ -303,8 +304,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.peft.dim = 8 recipe.peft.alpha = 16 recipe.optim.config.use_distributed_optimizer = False @@ -341,7 +342,8 @@ def finetune_performance_optimizations( Args: recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added - peft_scheme (str): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. Returns: run.Partial: Partial configuration for performance-optimized fine-tuning. diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index 93aeb5c07dc1..8b61bff80e01 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -26,7 +26,7 @@ from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.llm.gpt.model.llama import Llama3Config70B, LlamaModel -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -263,7 +263,8 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. 
- peft_scheme (Optional[str]): Name of the peft scheme to use for finetuning. Allowed values: 'lora'/'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. seq_length (int): Maximum number of tokens per microbatch. packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given maximum seq_length for better efficiency. By default, this value equals performance_mode. @@ -297,7 +298,7 @@ def finetune_recipe( if num_nodes is None: if peft_scheme is None or peft_scheme.lower() == 'none': num_nodes = 4 - elif peft_scheme.lower() == 'lora': + elif peft_scheme.lower() in ['lora', 'dora']: num_nodes = 1 recipe = default_finetune_recipe( @@ -307,8 +308,8 @@ def finetune_recipe( recipe.trainer.strategy.tensor_model_parallel_size = 8 recipe.trainer.strategy.pipeline_model_parallel_size = 4 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.peft.dim = 16 recipe.peft.alpha = 32 recipe.optim.config.use_distributed_optimizer = False @@ -346,7 +347,8 @@ def finetune_performance_optimizations( Args: recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added - peft_scheme (str): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. Returns: run.Partial: Partial configuration for performance-optimized fine-tuning. diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index 3e26f8fe1082..36b20c12ddb2 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -26,7 +26,7 @@ from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -250,7 +250,8 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for finetuning. Allowed values: 'lora'/'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. seq_length (int): Maximum number of tokens per microbatch. packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given maximum seq_length for better efficiency. By default, this value equals performance_mode. 
@@ -287,8 +288,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.peft.dim = 8 recipe.peft.alpha = 16 recipe.optim.config.use_distributed_optimizer = False @@ -325,7 +326,8 @@ def finetune_performance_optimizations( Args: recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added - peft_scheme (str): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. Returns: run.Partial: Partial configuration for performance-optimized fine-tuning. diff --git a/nemo/collections/llm/recipes/mistral_7b.py b/nemo/collections/llm/recipes/mistral_7b.py index 3bc1e568185a..9e2d2e256fbe 100644 --- a/nemo/collections/llm/recipes/mistral_7b.py +++ b/nemo/collections/llm/recipes/mistral_7b.py @@ -24,9 +24,8 @@ from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -207,8 +206,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. 
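Why 'dora' can be a drop-in for 'lora' everywhere above: DoRA (Liu et al., 2024) keeps the same low-rank factors and `dim`/`alpha` hyperparameters, adding only a learnable per-column magnitude. A toy illustration of the reparameterization, not NeMo's implementation:

```python
import torch

d_out, d_in, r = 8, 8, 2
W0 = torch.randn(d_out, d_in)                 # frozen pretrained weight
A = torch.randn(r, d_in) * 0.01               # trainable LoRA factor
B = torch.zeros(d_out, r)                     # zero-init: update starts at 0
m = W0.norm(dim=0, keepdim=True).clone()      # trainable magnitude, init ||W0||_c

V = W0 + B @ A                                # direction carries the low-rank delta
W_eff = m * V / V.norm(dim=0, keepdim=True)   # DoRA's effective weight
assert torch.allclose(W_eff, W0, atol=1e-6)   # identical to W0 at initialization
```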
@@ -237,8 +238,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/mistral_nemo_12b.py b/nemo/collections/llm/recipes/mistral_nemo_12b.py index 7d9fa1d792e9..a10f8ae804b8 100644 --- a/nemo/collections/llm/recipes/mistral_nemo_12b.py +++ b/nemo/collections/llm/recipes/mistral_nemo_12b.py @@ -24,9 +24,8 @@ from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.mistral import MistralModel, MistralNeMoConfig12B -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -255,8 +254,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. 
@@ -285,8 +286,10 @@ def finetune_recipe( ) if peft_scheme is None or peft_scheme.lower() == 'none': recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config( + PEFT_STR2CLS[peft_scheme.lower()], target_modules=['linear_qkv', 'linear_proj'], dim=32 + ) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index 16e6168e649b..ec1641a08d80 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -24,9 +24,8 @@ from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x22B, MixtralModel -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -227,7 +226,7 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: ), run.Config( MegatronCommOverlapCallback, - overlap_param_gather_with_optimizer_step=False, # Currently disabled due to an issue with checkpointing + overlap_param_gather_with_optimizer_step=False, # Currently disabled due to issue with checkpointing align_param_gather=True, ), ] @@ -259,8 +258,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given maximum seq_length for better efficiency. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given + maximum seq_length for better efficiency. Returns: run.Partial: Partial configuration for fine-tuning. 
@@ -286,8 +287,10 @@ def finetune_recipe( recipe.trainer.strategy.pipeline_model_parallel_size = 4 recipe.trainer.strategy.virtual_pipeline_model_parallel_size = 14 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config( + PEFT_STR2CLS[peft_scheme.lower()], target_modules=['linear_qkv', 'linear_proj'], dim=32 + ) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py index 5fbb0ac22c61..d06e22fc2180 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -24,9 +24,8 @@ from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x7B, MixtralModel -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -222,7 +221,7 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: run.Config(MegatronTokenDropCallback), run.Config( MegatronCommOverlapCallback, - overlap_param_gather_with_optimizer_step=False, # Currently disabled due to an issue with checkpointing. + overlap_param_gather_with_optimizer_step=False, # Currently disabled due to issue with checkpointing. align_param_gather=True, ), ] @@ -254,8 +253,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. 
@@ -280,8 +281,10 @@ def finetune_recipe( recipe.trainer.strategy.pipeline_model_parallel_size = 4 recipe.trainer.strategy.virtual_pipeline_model_parallel_size = 8 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config( + PEFT_STR2CLS[peft_scheme.lower()], target_modules=['linear_qkv', 'linear_proj'], dim=32 + ) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/nemotron3_22b.py b/nemo/collections/llm/recipes/nemotron3_22b.py index 2dd9c3ff5205..4c763301bc52 100644 --- a/nemo/collections/llm/recipes/nemotron3_22b.py +++ b/nemo/collections/llm/recipes/nemotron3_22b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Callable, Optional +from typing import Optional import lightning.pytorch as pl import nemo_run as run @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer @@ -239,8 +239,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. 
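The Mistral-NeMo and Mixtral hunks above also pass explicit adapter placement and a larger rank through the same lookup. Restated standalone, with the module names taken directly from the diff:

```python
import nemo_run as run

from nemo.collections.llm.peft import PEFT_STR2CLS

# As in mistral_nemo_12b.py and the mixtral recipes: adapters only on the
# fused QKV and attention output projections, rank 32, for either scheme.
peft_scheme = 'dora'
peft = run.Config(
    PEFT_STR2CLS[peft_scheme.lower()],
    target_modules=['linear_qkv', 'linear_proj'],
    dim=32,
)
```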
@@ -265,8 +267,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 8 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/nemotron3_4b.py b/nemo/collections/llm/recipes/nemotron3_4b.py index c208ee740265..fc6f09a09358 100644 --- a/nemo/collections/llm/recipes/nemotron3_4b.py +++ b/nemo/collections/llm/recipes/nemotron3_4b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer @@ -191,8 +191,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -216,8 +218,8 @@ def finetune_recipe( ) if peft_scheme is None or peft_scheme.lower() == 'none': recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/nemotron3_8b.py b/nemo/collections/llm/recipes/nemotron3_8b.py index 7799512c6260..f60463330cad 100644 --- a/nemo/collections/llm/recipes/nemotron3_8b.py +++ b/nemo/collections/llm/recipes/nemotron3_8b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Callable, Optional +from typing import Optional import lightning.pytorch as pl import nemo_run as run @@ -21,8 +21,7 @@ from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer @@ -256,8 +255,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -282,8 +283,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/nemotron4_15b.py b/nemo/collections/llm/recipes/nemotron4_15b.py index ad0f884b0d3b..49f92fcc1616 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b.py +++ b/nemo/collections/llm/recipes/nemotron4_15b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Callable, Optional +from typing import Optional import lightning.pytorch as pl import nemo_run as run @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer @@ -228,8 +228,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. 
+ packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -254,8 +256,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 4 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/nemotron4_340b.py b/nemo/collections/llm/recipes/nemotron4_340b.py index b22abc43d558..14d4c0f32d11 100644 --- a/nemo/collections/llm/recipes/nemotron4_340b.py +++ b/nemo/collections/llm/recipes/nemotron4_340b.py @@ -12,16 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Callable, Optional +from typing import Optional import lightning.pytorch as pl import nemo_run as run import torch -from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer @@ -240,8 +239,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. 
@@ -268,8 +269,8 @@ def finetune_recipe( recipe.trainer.strategy.tensor_model_parallel_size = 8 recipe.trainer.strategy.pipeline_model_parallel_size = 12 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.trainer.strategy.tensor_model_parallel_size = 8 recipe.trainer.strategy.pipeline_model_parallel_size = 4 recipe.optim.config.lr = 1e-4 diff --git a/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py b/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py index 1cbc877dc33e..73bbe4735adb 100644 --- a/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py +++ b/nemo/collections/llm/recipes/phi3_mini_4k_instruct.py @@ -25,7 +25,7 @@ from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.llm.gpt.model.phi3mini import Phi3ConfigMini, Phi3Model -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -222,8 +222,8 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', - 'none'/None. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. seq_length (int): Maximum number of tokens per microbatch. packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given maximum seq_length for better efficiency. By default, this value equals performance_mode. @@ -260,8 +260,8 @@ def finetune_recipe( if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 1 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.peft.dim = 8 recipe.peft.alpha = 16 recipe.optim.config.use_distributed_optimizer = False diff --git a/nemo/collections/llm/recipes/qwen2_1p5b.py b/nemo/collections/llm/recipes/qwen2_1p5b.py index a3d705c4fb3a..99ba5cd907fc 100644 --- a/nemo/collections/llm/recipes/qwen2_1p5b.py +++ b/nemo/collections/llm/recipes/qwen2_1p5b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -194,8 +194,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. 
- peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -218,8 +220,8 @@ def finetune_recipe( ) if peft_scheme is None or peft_scheme.lower() == 'none': recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/qwen2_500m.py b/nemo/collections/llm/recipes/qwen2_500m.py index 08541ca9e421..96d99c271c85 100644 --- a/nemo/collections/llm/recipes/qwen2_500m.py +++ b/nemo/collections/llm/recipes/qwen2_500m.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -194,8 +194,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. 
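On the recurring `packed_sequence` docstring: packing concatenates several short training examples into one fixed-length sequence (2048 tokens by default in these recipes). The wiring is outside this excerpt; a hedged sketch of how a recipe's data module is typically pointed at packing via `PackedSequenceSpecs`, with the field wiring assumed:

```python
import nemo_run as run

from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs

# Assumed sketch: hand the data module a packing spec so multiple training
# sequences are concatenated up to seq_length tokens per sample.
seq_length = 2048
packed_specs = run.Config(PackedSequenceSpecs, packed_sequence_size=seq_length)
# recipe.data.seq_length = seq_length
# recipe.data.packed_sequence_specs = packed_specs
```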
@@ -218,8 +220,8 @@ def finetune_recipe( ) if peft_scheme is None or peft_scheme.lower() == 'none': recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.optim.config.lr = 1e-4 else: raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") diff --git a/nemo/collections/llm/recipes/qwen2_72b.py b/nemo/collections/llm/recipes/qwen2_72b.py index c0bc9bf40611..33bb0dd40835 100644 --- a/nemo/collections/llm/recipes/qwen2_72b.py +++ b/nemo/collections/llm/recipes/qwen2_72b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -194,8 +194,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. - packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training + efficiency. Default sequence length is 2048. Returns: run.Partial: Partial configuration for fine-tuning. @@ -221,8 +223,8 @@ def finetune_recipe( recipe.trainer.strategy.tensor_model_parallel_size = 8 recipe.trainer.strategy.pipeline_model_parallel_size = 4 recipe.optim.config.lr = 5e-6 - elif peft_scheme.lower() == 'lora': - recipe.peft = run.Config(LoRA) + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) recipe.trainer.strategy.tensor_model_parallel_size = 8 recipe.optim.config.lr = 1e-4 else: diff --git a/nemo/collections/llm/recipes/qwen2_7b.py b/nemo/collections/llm/recipes/qwen2_7b.py index 67bcc5e953bf..2e62176a408e 100644 --- a/nemo/collections/llm/recipes/qwen2_7b.py +++ b/nemo/collections/llm/recipes/qwen2_7b.py @@ -20,7 +20,7 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.peft import PEFT_STR2CLS from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing @@ -194,8 +194,10 @@ def finetune_recipe( name (str): Name of the fine-tuning run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. - peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. 
diff --git a/nemo/collections/llm/recipes/qwen2_7b.py b/nemo/collections/llm/recipes/qwen2_7b.py
index 67bcc5e953bf..2e62176a408e 100644
--- a/nemo/collections/llm/recipes/qwen2_7b.py
+++ b/nemo/collections/llm/recipes/qwen2_7b.py
@@ -20,7 +20,7 @@
 from nemo.collections.llm.api import finetune, pretrain
 from nemo.collections.llm.gpt.data.mock import MockDataModule
-from nemo.collections.llm.peft.lora import LoRA
+from nemo.collections.llm.peft import PEFT_STR2CLS
 from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe
 from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
@@ -194,8 +194,10 @@ def finetune_recipe(
         name (str): Name of the fine-tuning run.
         num_nodes (int): Number of compute nodes to use.
         num_gpus_per_node (int): Number of GPUs per node.
-        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None.
-        packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048.
+        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
+            Allowed values: 'lora'/'dora'/'none'/None.
+        packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training
+            efficiency. Default sequence length is 2048.

     Returns:
         run.Partial: Partial configuration for fine-tuning.
@@ -219,8 +221,8 @@ def finetune_recipe(
     if peft_scheme is None or peft_scheme.lower() == 'none':
         recipe.trainer.strategy.tensor_model_parallel_size = 2
         recipe.optim.config.lr = 5e-6
-    elif peft_scheme.lower() == 'lora':
-        recipe.peft = run.Config(LoRA)
+    elif peft_scheme.lower() in ['lora', 'dora']:
+        recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()])
         recipe.optim.config.lr = 1e-4
     else:
         raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")
diff --git a/nemo/collections/llm/recipes/starcoder2_15b.py b/nemo/collections/llm/recipes/starcoder2_15b.py
index 14b53809111a..e424cb67dba4 100644
--- a/nemo/collections/llm/recipes/starcoder2_15b.py
+++ b/nemo/collections/llm/recipes/starcoder2_15b.py
@@ -20,7 +20,7 @@
 from nemo.collections.llm.api import finetune, pretrain
 from nemo.collections.llm.gpt.data.mock import MockDataModule
-from nemo.collections.llm.peft.lora import LoRA
+from nemo.collections.llm.peft import PEFT_STR2CLS
 from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe
 from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
@@ -194,8 +194,10 @@ def finetune_recipe(
         name (str): Name of the fine-tuning run.
         num_nodes (int): Number of compute nodes to use.
         num_gpus_per_node (int): Number of GPUs per node.
-        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None.
-        packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048.
+        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
+            Allowed values: 'lora'/'dora'/'none'/None.
+        packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training
+            efficiency. Default sequence length is 2048.

     Returns:
         run.Partial: Partial configuration for fine-tuning.
@@ -219,8 +221,8 @@ def finetune_recipe(
     if peft_scheme is None or peft_scheme.lower() == 'none':
         recipe.trainer.strategy.tensor_model_parallel_size = 4
         recipe.optim.config.lr = 5e-6
-    elif peft_scheme.lower() == 'lora':
-        recipe.peft = run.Config(LoRA)
+    elif peft_scheme.lower() in ['lora', 'dora']:
+        recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()])
         recipe.optim.config.lr = 1e-4
     else:
         raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")
diff --git a/nemo/collections/llm/recipes/starcoder2_3b.py b/nemo/collections/llm/recipes/starcoder2_3b.py
index 3ee81522ebc9..faf0b416c56a 100644
--- a/nemo/collections/llm/recipes/starcoder2_3b.py
+++ b/nemo/collections/llm/recipes/starcoder2_3b.py
@@ -20,7 +20,7 @@
 from nemo.collections.llm.api import finetune, pretrain
 from nemo.collections.llm.gpt.data.mock import MockDataModule
-from nemo.collections.llm.peft.lora import LoRA
+from nemo.collections.llm.peft import PEFT_STR2CLS
 from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe
 from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
@@ -194,8 +194,10 @@ def finetune_recipe(
         name (str): Name of the fine-tuning run.
         num_nodes (int): Number of compute nodes to use.
         num_gpus_per_node (int): Number of GPUs per node.
-        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None.
-        packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048.
+        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
+            Allowed values: 'lora'/'dora'/'none'/None.
+        packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training
+            efficiency. Default sequence length is 2048.

     Returns:
         run.Partial: Partial configuration for fine-tuning.
@@ -219,8 +221,8 @@ def finetune_recipe(
     if peft_scheme is None or peft_scheme.lower() == 'none':
         recipe.trainer.strategy.tensor_model_parallel_size = 2
         recipe.optim.config.lr = 5e-6
-    elif peft_scheme.lower() == 'lora':
-        recipe.peft = run.Config(LoRA)
+    elif peft_scheme.lower() in ['lora', 'dora']:
+        recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()])
         recipe.optim.config.lr = 1e-4
     else:
         raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")
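
Because every recipe now carries the same three-way branch, the dispatch is easy to exercise in isolation. A self-contained sketch mirroring the branch above; the registry here is a stand-in dict rather than the real PEFT classes, so the values are illustrative:

    from typing import Optional

    # Stand-in for nemo.collections.llm.peft.PEFT_STR2CLS (illustrative values).
    PEFT_STR2CLS = {'lora': 'LoRA', 'dora': 'DoRA'}

    def resolve_peft(peft_scheme: Optional[str]):
        """Return (peft_config, lr) for a scheme name, case-insensitively."""
        if peft_scheme is None or peft_scheme.lower() == 'none':
            return None, 5e-6  # full fine-tuning: no adapter, smaller LR
        if peft_scheme.lower() in PEFT_STR2CLS:
            return PEFT_STR2CLS[peft_scheme.lower()], 1e-4  # adapter, larger LR
        raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")

    assert resolve_peft('DoRA') == ('DoRA', 1e-4)
    assert resolve_peft(None) == (None, 5e-6)
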
diff --git a/nemo/collections/llm/recipes/starcoder2_7b.py b/nemo/collections/llm/recipes/starcoder2_7b.py
index 96b5ab36b876..091e882cd932 100644
--- a/nemo/collections/llm/recipes/starcoder2_7b.py
+++ b/nemo/collections/llm/recipes/starcoder2_7b.py
@@ -20,7 +20,7 @@
 from nemo.collections.llm.api import finetune, pretrain
 from nemo.collections.llm.gpt.data.mock import MockDataModule
-from nemo.collections.llm.peft.lora import LoRA
+from nemo.collections.llm.peft import PEFT_STR2CLS
 from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe
 from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
@@ -194,8 +194,10 @@ def finetune_recipe(
         name (str): Name of the fine-tuning run.
         num_nodes (int): Number of compute nodes to use.
         num_gpus_per_node (int): Number of GPUs per node.
-        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None.
-        packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048.
+        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
+            Allowed values: 'lora'/'dora'/'none'/None.
+        packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training
+            efficiency. Default sequence length is 2048.

     Returns:
         run.Partial: Partial configuration for fine-tuning.
@@ -219,8 +221,8 @@ def finetune_recipe(
     if peft_scheme is None or peft_scheme.lower() == 'none':
         recipe.trainer.strategy.tensor_model_parallel_size = 2
         recipe.optim.config.lr = 5e-6
-    elif peft_scheme.lower() == 'lora':
-        recipe.peft = run.Config(LoRA)
+    elif peft_scheme.lower() in ['lora', 'dora']:
+        recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()])
         recipe.optim.config.lr = 1e-4
     else:
         raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")
diff --git a/nemo/collections/llm/recipes/starcoder_15b.py b/nemo/collections/llm/recipes/starcoder_15b.py
index d87788be5613..382d0eb4d8ca 100644
--- a/nemo/collections/llm/recipes/starcoder_15b.py
+++ b/nemo/collections/llm/recipes/starcoder_15b.py
@@ -23,7 +23,7 @@
 from nemo.collections.llm.api import finetune, pretrain
 from nemo.collections.llm.gpt.data.mock import MockDataModule
 from nemo.collections.llm.gpt.model.starcoder import StarcoderConfig15B, StarcoderModel
-from nemo.collections.llm.peft.lora import LoRA
+from nemo.collections.llm.peft import PEFT_STR2CLS
 from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe
 from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
@@ -280,7 +280,8 @@ def finetune_recipe(
         name (str): Name of the fine-tuning run.
         num_nodes (int): Number of compute nodes to use.
         num_gpus_per_node (int): Number of GPUs per node.
-        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None.
+        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
+            Allowed values: 'lora'/'dora'/'none'/None.

     Returns:
         run.Partial: Partial configuration for fine-tuning.
@@ -302,8 +303,8 @@ def finetune_recipe(
     if peft_scheme is None or peft_scheme.lower() == 'none':
         recipe.trainer.strategy.pipeline_model_parallel_size = 8
         recipe.optim.config.lr = 5e-6
-    elif peft_scheme.lower() == 'lora':
-        recipe.peft = run.Config(LoRA)
+    elif peft_scheme.lower() in ['lora', 'dora']:
+        recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()])
         recipe.optim.config.lr = 1e-4
     else:
         raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")
diff --git a/nemo/collections/llm/recipes/t5_11b.py b/nemo/collections/llm/recipes/t5_11b.py
index 8baf54b4f42f..ee7323aa044f 100644
--- a/nemo/collections/llm/recipes/t5_11b.py
+++ b/nemo/collections/llm/recipes/t5_11b.py
@@ -24,7 +24,7 @@
 from nemo import lightning as nl
 from nemo.collections.llm.api import finetune, pretrain
-from nemo.collections.llm.peft.lora import LoRA
+from nemo.collections.llm.peft import PEFT_STR2CLS
 from nemo.collections.llm.recipes.finetune_default import default_finetune_trainer, nemo_resume
 from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
 from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
@@ -229,7 +229,8 @@ def finetune_recipe(
         name (str): Name of the fine-tuning run.
         num_nodes (int): Number of compute nodes to use.
         num_gpus_per_node (int): Number of GPUs per node.
-        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None.
+        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
+            Allowed values: 'lora'/'dora'/'none'/None.

     Returns:
         run.Partial: Partial configuration for fine-tuning.
@@ -279,8 +280,8 @@ def finetune_recipe(
     if peft_scheme is None or peft_scheme.lower() == 'none':
         recipe.trainer.strategy.tensor_model_parallel_size = 4
         recipe.optim.config.lr = 5e-6
-    elif peft_scheme.lower() == 'lora':
-        recipe.peft = run.Config(LoRA)
+    elif peft_scheme.lower() in ['lora', 'dora']:
+        recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()])
         recipe.optim.config.lr = 1e-4
     else:
         raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")
diff --git a/nemo/collections/llm/recipes/t5_220m.py b/nemo/collections/llm/recipes/t5_220m.py
index 27feb43837fb..edc9fdba62d7 100644
--- a/nemo/collections/llm/recipes/t5_220m.py
+++ b/nemo/collections/llm/recipes/t5_220m.py
@@ -24,7 +24,7 @@
 from nemo import lightning as nl
 from nemo.collections.llm.api import finetune, pretrain
-from nemo.collections.llm.peft.lora import LoRA
+from nemo.collections.llm.peft import PEFT_STR2CLS
 from nemo.collections.llm.recipes.finetune_default import default_finetune_trainer, nemo_resume
 from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
 from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
@@ -229,7 +229,8 @@ def finetune_recipe(
         name (str): Name of the fine-tuning run.
         num_nodes (int): Number of compute nodes to use.
         num_gpus_per_node (int): Number of GPUs per node.
-        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None.
+        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
+            Allowed values: 'lora'/'dora'/'none'/None.

     Returns:
         run.Partial: Partial configuration for fine-tuning.
@@ -279,8 +280,8 @@ def finetune_recipe(
     if peft_scheme is None or peft_scheme.lower() == 'none':
         recipe.trainer.strategy.tensor_model_parallel_size = 1
         recipe.optim.config.lr = 5e-6
-    elif peft_scheme.lower() == 'lora':
-        recipe.peft = run.Config(LoRA)
+    elif peft_scheme.lower() in ['lora', 'dora']:
+        recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()])
         recipe.optim.config.lr = 1e-4
     else:
         raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")
diff --git a/nemo/collections/llm/recipes/t5_3b.py b/nemo/collections/llm/recipes/t5_3b.py
index 333661d97117..82772e1b865a 100644
--- a/nemo/collections/llm/recipes/t5_3b.py
+++ b/nemo/collections/llm/recipes/t5_3b.py
@@ -24,7 +24,7 @@
 from nemo import lightning as nl
 from nemo.collections.llm.api import finetune, pretrain
-from nemo.collections.llm.peft.lora import LoRA
+from nemo.collections.llm.peft import PEFT_STR2CLS
 from nemo.collections.llm.recipes.finetune_default import default_finetune_trainer, nemo_resume
 from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
 from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
@@ -229,7 +229,8 @@ def finetune_recipe(
         name (str): Name of the fine-tuning run.
         num_nodes (int): Number of compute nodes to use.
         num_gpus_per_node (int): Number of GPUs per node.
-        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None.
+        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
+            Allowed values: 'lora'/'dora'/'none'/None.

     Returns:
         run.Partial: Partial configuration for fine-tuning.
@@ -279,8 +280,8 @@ def finetune_recipe(
     if peft_scheme is None or peft_scheme.lower() == 'none':
         recipe.trainer.strategy.tensor_model_parallel_size = 2
         recipe.optim.config.lr = 5e-6
-    elif peft_scheme.lower() == 'lora':
-        recipe.peft = run.Config(LoRA)
+    elif peft_scheme.lower() in ['lora', 'dora']:
+        recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()])
         recipe.optim.config.lr = 1e-4
     else:
         raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")
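
With these changes in place, any of the recipes touched here accepts either scheme by name. A usage sketch, assuming nemo_run and the NeMo LLM collection are installed; the run name and cluster sizes are placeholders, and the keyword arguments follow the Args sections shown above:

    import nemo_run as run
    from nemo.collections.llm.recipes import qwen2_7b

    # 'dora' is resolved through PEFT_STR2CLS; 'lora' and 'none' work the same way.
    recipe = qwen2_7b.finetune_recipe(
        name="qwen2_7b_dora",   # placeholder run name
        num_nodes=1,
        num_gpus_per_node=8,
        peft_scheme="dora",
    )
    # run.run(recipe)  # launch once an executor is configured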