Merge branch 'main' into ko3n1g/ci/small-fixes
ko3n1g authored Nov 4, 2024
2 parents b093c25 + a5c8413 commit 8220dff
Showing 19 changed files with 328 additions and 291 deletions.
4 changes: 2 additions & 2 deletions nemo/collections/llm/__init__.py
@@ -85,8 +85,8 @@
     MixtralModel,
     Nemotron3Config4B,
     Nemotron3Config8B,
+    Nemotron3Config22B,
     Nemotron4Config15B,
-    Nemotron4Config22B,
     Nemotron4Config340B,
     NemotronConfig,
     NemotronModel,
@@ -138,8 +138,8 @@
     "NemotronModel",
     "Nemotron3Config4B",
     "Nemotron3Config8B",
+    "Nemotron3Config22B",
     "Nemotron4Config15B",
-    "Nemotron4Config22B",
     "Nemotron4Config340B",
     "NemotronConfig",
     "SSMConfig",
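
The rename also changes the top-level export: the 22B config is now published under its Nemotron3 name. A minimal sketch of the updated import, assuming a NeMo build that includes this commit:

# Quick check of the renamed top-level export (import path per this diff).
from nemo.collections.llm import Nemotron3Config22B

print(Nemotron3Config22B.num_layers, Nemotron3Config22B.hidden_size)  # 40 and 6144, per nemotron.py below
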
4 changes: 2 additions & 2 deletions nemo/collections/llm/gpt/model/__init__.py
@@ -73,8 +73,8 @@
 from nemo.collections.llm.gpt.model.nemotron import (
     Nemotron3Config4B,
     Nemotron3Config8B,
+    Nemotron3Config22B,
     Nemotron4Config15B,
-    Nemotron4Config22B,
     Nemotron4Config340B,
     NemotronConfig,
     NemotronModel,
@@ -137,7 +137,7 @@
     "Nemotron3Config4B",
     "Nemotron3Config8B",
     "Nemotron4Config15B",
-    "Nemotron4Config22B",
+    "Nemotron3Config22B",
     "Nemotron4Config340B",
     "NemotronModel",
     "CodeLlamaConfig7B",
20 changes: 11 additions & 9 deletions nemo/collections/llm/gpt/model/nemotron.py
@@ -50,6 +50,7 @@ class NemotronConfig(GPTConfig):
     persist_layer_norm: bool = True
     bias_dropout_add_fusion: bool = False
     layernorm_zero_centered_gamma: bool = True
+    cross_entropy_loss_fusion: bool = True
 
     # Nemotron3Config4B as default configs
     num_layers: int = 32
@@ -87,27 +88,27 @@ class Nemotron3Config8B(NemotronConfig):
 
 
 @dataclass
-class Nemotron4Config15B(NemotronConfig):
-    num_layers: int = 32
+class Nemotron3Config22B(NemotronConfig):
+    num_layers: int = 40
     seq_length: int = 4096
     hidden_size: int = 6144
     ffn_hidden_size: int = 24576
     num_attention_heads: int = 48
-    num_query_groups: Optional[int] = 8
+    num_query_groups: Optional[int] = None
     kv_channels: Optional[int] = None
-    init_method_std: float = 0.0134
+    init_method_std: float = 0.008
 
 
 @dataclass
-class Nemotron4Config22B(NemotronConfig):
-    num_layers: int = 40
+class Nemotron4Config15B(NemotronConfig):
+    num_layers: int = 32
     seq_length: int = 4096
     hidden_size: int = 6144
     ffn_hidden_size: int = 24576
     num_attention_heads: int = 48
-    num_query_groups: Optional[int] = None
+    num_query_groups: Optional[int] = 8
     kv_channels: Optional[int] = None
-    init_method_std: float = 0.008
+    init_method_std: float = 0.0134
 
 
 @dataclass
@@ -141,6 +142,7 @@ def init(self) -> NemotronModel:
     def apply(self, output_path: Path) -> Path:
         from transformers import NemotronForCausalLM
 
+        print('Start converting Nemotron model..')
         source = NemotronForCausalLM.from_pretrained(str(self), torch_dtype='auto')
         target = self.init()
         trainer = self.nemo_setup(target)
@@ -357,8 +359,8 @@ def _export_qkv(ctx: io.TransformCTX, linear_qkv):
     "NemotronConfig",
     "Nemotron3Config4B",
     "Nemotron3Config8B",
+    "Nemotron3Config22B",
     "Nemotron4Config15B",
-    "Nemotron4Config22B",
     "Nemotron4Config340B",
     "NemotronModel",
 ]
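
The hunks above move the 22B hyperparameters into the Nemotron3 family, leave the 15B config under Nemotron4, and add a cross_entropy_loss_fusion default to the shared NemotronConfig. A small sketch of what the renamed dataclasses expose, using only values shown in this diff (assumes a NeMo build with this commit):

# Sketch: the 22B config now belongs to the Nemotron3 family; the 15B config stays Nemotron4.
from nemo.collections.llm.gpt.model.nemotron import Nemotron3Config22B, Nemotron4Config15B

assert Nemotron3Config22B.num_layers == 40 and Nemotron3Config22B.num_query_groups is None
assert Nemotron3Config22B.init_method_std == 0.008
assert Nemotron4Config15B.num_layers == 32 and Nemotron4Config15B.num_query_groups == 8
assert Nemotron4Config15B.init_method_std == 0.0134
assert Nemotron3Config22B.cross_entropy_loss_fusion  # new NemotronConfig default (True)
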
12 changes: 6 additions & 6 deletions nemo/collections/llm/recipes/__init__.py
@@ -47,12 +47,12 @@
     nemotron,
     nemotron3_4b,
     nemotron3_8b,
+    nemotron3_22b,
+    nemotron3_22b_16k,
+    nemotron3_22b_64k,
     nemotron4_15b,
     nemotron4_15b_16k,
     nemotron4_15b_64k,
-    nemotron4_22b,
-    nemotron4_22b_16k,
-    nemotron4_22b_64k,
     nemotron4_340b,
     qwen2,
     qwen2_1p5b,
@@ -100,12 +100,12 @@
     "nemotron",
     "nemotron3_4b",
     "nemotron3_8b",
+    "nemotron3_22b",
+    "nemotron3_22b_16k",
+    "nemotron3_22b_64k",
     "nemotron4_15b",
     "nemotron4_15b_16k",
     "nemotron4_15b_64k",
-    "nemotron4_22b",
-    "nemotron4_22b_16k",
-    "nemotron4_22b_64k",
     "nemotron4_340b",
     "t5_220m",
     "t5_3b",
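
With the recipe modules re-exported under their new names, the renamed pretrain factories are reached the same way as before. A minimal sketch, assuming a NeMo build with this commit and the nemo_run dependency installed; the keyword arguments mirror the Python API examples in the recipe docstrings below:

# Sketch: the recipe modules follow the renamed factory names.
from nemo.collections.llm.recipes import nemotron3_22b, nemotron3_22b_64k

recipe = nemotron3_22b.pretrain_recipe(name="nemotron_pretrain", num_nodes=1)
long_ctx_recipe = nemotron3_22b_64k.pretrain_recipe(name="nemotron_pretrain_64k", num_nodes=2)
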
18 changes: 9 additions & 9 deletions nemo/collections/llm/recipes/nemotron.py
@@ -24,8 +24,8 @@
 from nemo.collections.llm.gpt.model.nemotron import (
     Nemotron3Config4B,
     Nemotron3Config8B,
+    Nemotron3Config22B,
     Nemotron4Config15B,
-    Nemotron4Config22B,
     Nemotron4Config340B,
     NemotronModel,
 )
@@ -37,9 +37,9 @@ def nemotron_model(version: str) -> run.Config[pl.LightningModule]:
     A function to create a Nemotron models.
 
     Args:
-        version (str): The version of the Nemotron model to create. one of ["nemotron3_4b", "nemotron3_8b",
+        version (str): The version of the Nemotron model to create. one of ["nemotron3_4b", "nemotron3_8b",\
+        "nemotron3_22b", "nemotron3_22b_16k", "nemotron3_22b_64k",
         "nemotron4_15b", "nemotron4_15b_16k", "nemotron4_15b_64k",
-        "nemotron4_22b", "nemotron4_22b_16k", "nemotron4_22b_64k",
         "nemotron4_340b"].
 
     Returns:
@@ -50,18 +50,18 @@ def nemotron_model(version: str) -> run.Config[pl.LightningModule]:
         config = run.Config(Nemotron3Config4B)
     elif version == "nemotron3_8b":
         config = run.Config(Nemotron3Config8B)
+    elif version == "nemotron3_22b":
+        config = run.Config(Nemotron3Config22B)
+    elif version == "nemotron3_22b_16k":
+        config = run.Config(Nemotron3Config22B, seq_length=16384)
+    elif version == "nemotron3_22b_64k":
+        config = run.Config(Nemotron3Config22B, seq_length=65536)
     elif version == "nemotron4_15b":
         config = run.Config(Nemotron4Config15B)
     elif version == "nemotron4_15b_16k":
         config = run.Config(Nemotron4Config15B, seq_length=16384)
     elif version == "nemotron4_15b_64k":
         config = run.Config(Nemotron4Config15B, seq_length=65536)
-    elif version == "nemotron4_22b":
-        config = run.Config(Nemotron4Config22B)
-    elif version == "nemotron4_22b_16k":
-        config = run.Config(Nemotron4Config22B, seq_length=16384)
-    elif version == "nemotron4_22b_64k":
-        config = run.Config(Nemotron4Config22B, seq_length=65536)
     elif version == "nemotron4_340b":
         config = run.Config(Nemotron4Config340B)
 
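
The elif chain above determines which version strings nemotron_model now accepts. A short sketch of the updated 22B selectors, with the seq_length values taken from the same hunk (assumes a NeMo build with this commit):

# Sketch: the 22B variants are now selected with nemotron3_* version strings.
from nemo.collections.llm.recipes.nemotron import nemotron_model

base = nemotron_model("nemotron3_22b")         # built from Nemotron3Config22B, default seq_length
ctx_16k = nemotron_model("nemotron3_22b_16k")  # Nemotron3Config22B with seq_length=16384
ctx_64k = nemotron_model("nemotron3_22b_64k")  # Nemotron3Config22B with seq_length=65536
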
nemo/collections/llm/recipes/{nemotron4_22b.py → nemotron3_22b.py}

@@ -18,28 +18,30 @@
 import pytorch_lightning as pl
 import torch
 
-from nemo.collections.llm.api import pretrain
+from nemo.collections.llm.api import finetune, pretrain
 from nemo.collections.llm.gpt.data.mock import MockDataModule
+from nemo.collections.llm.peft.lora import LoRA
+from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe
 from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
 from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
 from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
 from nemo.utils.exp_manager import TimingCallback
 
-NAME = "nemotron4_22b"
+NAME = "nemotron3_22b"
 
 
 @run.cli.factory(name=NAME)
 def model() -> run.Config[pl.LightningModule]:
     """
-    Factory function to create a Nemotron4 22b model configuration.
+    Factory function to create a Nemotron3 22B model configuration.
 
     Returns:
-        run.Config[pl.LightningModule]: Configuration for the Nemotron4 22b model.
+        run.Config[pl.LightningModule]: Configuration for the Nemotron3 22b model.
 
     Examples:
         CLI usage:
-            $ nemo llm pretrain model=nemotron4_22b ...
+            $ nemo llm pretrain model=nemotron3_22b ...
 
         Python API usage:
             >>> model_config = model()
@@ -85,7 +87,7 @@ def pretrain_recipe(
     fn=pretrain,
 ) -> run.Partial:
     """
-    Create a pre-training recipe for Nemotron4 22b model.
+    Create a pre-training recipe for Nemotron3 22B model.
 
     This function sets up a complete configuration for pre-training, including
     model, trainer, data, logging, optimization, and resumption settings.
@@ -124,8 +126,8 @@
     Examples:
         CLI usage:
-            $ nemo llm pretrain --factory nemotron4_22b
-            $ nemo llm pretrain --factory "nemotron4_22b(num_nodes=1, name='my_nemotron_pretrain')"
+            $ nemo llm pretrain --factory nemotron3_22b
+            $ nemo llm pretrain --factory "nemotron3_22b(num_nodes=1, name='my_nemotron_pretrain')"
 
         Python API usage:
             >>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=1)
@@ -181,7 +183,7 @@ def pretrain_recipe(
 
 def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
     """
-    Create a performance-optimized pre-training recipe for Nemotron4 22B model.
+    Create a performance-optimized pre-training recipe for Nemotron3 22B model.
 
     This method enables performance optimizations that may not be suitable for all use cases.
     It builds upon the standard pre-training recipe and adds additional performance enhancements.
@@ -214,3 +216,61 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
         )
     )
     return recipe
+
+
+@run.cli.factory(target=finetune, name=NAME)
+def finetune_recipe(
+    dir: Optional[str] = None,
+    name: str = "default",
+    num_nodes: int = 1,
+    num_gpus_per_node: int = 8,
+    peft_scheme: Optional[str] = 'lora',
+    packed_sequence: bool = False,
+) -> run.Partial:
+    """
+    Create a fine-tuning recipe for Nemotron3 22B model.
+
+    This function sets up a complete configuration for fine-tuning, including
+    model, trainer, data, logging, optimization, and resumption settings.
+    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.
+
+    Args:
+        dir (Optional[str]): Directory for saving logs and checkpoints.
+        name (str): Name of the fine-tuning run.
+        num_nodes (int): Number of compute nodes to use.
+        num_gpus_per_node (int): Number of GPUs per node.
+        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None.
+        packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048.
+
+    Returns:
+        run.Partial: Partial configuration for fine-tuning.
+
+    Examples:
+        CLI usage:
+            $ nemo llm finetune --factory nemotron3_22b
+
+        Python API usage:
+            >>> recipe = finetune_recipe(name="nemotron3_22b_finetune", num_nodes=8)
+            >>> print(recipe)
+
+    Note:
+        This recipe uses the SQuAD dataset for fine-tuning. For more information
+        on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
+        `examples/llm/finetune/` directory.
+    """
+
+    recipe = default_finetune_recipe(
+        model(), "thhaus/nemotron3-22b-hf", dir, name, num_nodes, num_gpus_per_node, packed_sequence
+    )
+    if peft_scheme is None or peft_scheme.lower() == 'none':
+        recipe.trainer.strategy.tensor_model_parallel_size = 8
+        recipe.optim.config.lr = 5e-6
+    elif peft_scheme.lower() == 'lora':
+        recipe.peft = run.Config(LoRA)
+        recipe.optim.config.lr = 1e-4
+    else:
+        raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")
+
+    # some settings currently do not function correctly with finetuning
+    recipe.model.config.cross_entropy_loss_fusion = False
+    return recipe
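
The new finetune_recipe factory above defaults to LoRA and falls back to full fine-tuning with tensor parallelism when peft_scheme is None. A brief sketch of driving it from Python; the module path nemo.collections.llm.recipes.nemotron3_22b is inferred from the recipes/__init__.py import list in this commit, and the checked values simply restate the branches of the function body:

# Sketch: LoRA fine-tuning (default) vs. full fine-tuning for the 22B recipe.
from nemo.collections.llm.recipes.nemotron3_22b import finetune_recipe

lora = finetune_recipe(name="nemotron3_22b_lora", num_nodes=1, peft_scheme="lora")
assert lora.optim.config.lr == 1e-4                            # LoRA branch

full = finetune_recipe(name="nemotron3_22b_sft", num_nodes=1, peft_scheme=None)
assert full.trainer.strategy.tensor_model_parallel_size == 8   # full fine-tuning branch
assert full.optim.config.lr == 5e-6
assert full.model.config.cross_entropy_loss_fusion is False    # disabled for fine-tuning
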
nemo/collections/llm/recipes/{nemotron4_22b_16k.py → nemotron3_22b_16k.py}

@@ -25,20 +25,20 @@
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
 from nemo.utils.exp_manager import TimingCallback
 
-NAME = "nemotron4_22b_16k"
+NAME = "nemotron3_22b_16k"
 
 
 @run.cli.factory(name=NAME)
 def model() -> run.Config[pl.LightningModule]:
     """
-    Factory function to create a Nemotron4 22b model with 16k sequence length.
+    Factory function to create a Nemotron3 22B model with 16k sequence length.
 
     Returns:
-        run.Config[pl.LightningModule]: Configuration for the Nemotron4 22b and 16k sequence length model.
+        run.Config[pl.LightningModule]: Configuration for the Nemotron3 22b and 16k sequence length model.
 
     Examples:
         CLI usage:
-            $ nemo llm pretrain model=nemotron4_22b_16k ...
+            $ nemo llm pretrain model=nemotron3_22b_16k ...
 
         Python API usage:
             >>> model_config = model()
@@ -83,7 +83,7 @@ def pretrain_recipe(
     fn=pretrain,
 ) -> run.Partial:
     """
-    Create a pre-training recipe for Nemotron4 22b model with 16k sequence length.
+    Create a pre-training recipe for Nemotron3 22B model with 16k sequence length.
 
     This function sets up a complete configuration for pre-training, including
     model, trainer, data, logging, optimization, and resumption settings.
@@ -121,8 +121,8 @@
     Examples:
         CLI usage:
-            $ nemo llm pretrain --factory nemotron4_22b_16k
-            $ nemo llm pretrain --factory "nemotron4_22b_16k(num_nodes=1, name='my_nemotron_pretrain')"
+            $ nemo llm pretrain --factory nemotron3_22b_16k
+            $ nemo llm pretrain --factory "nemotron3_22b_16k(num_nodes=1, name='my_nemotron_pretrain')"
 
         Python API usage:
             >>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=1)
nemo/collections/llm/recipes/{nemotron4_22b_64k.py → nemotron3_22b_64k.py}

@@ -25,20 +25,20 @@
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
 from nemo.utils.exp_manager import TimingCallback
 
-NAME = "nemotron4_22b_64k"
+NAME = "nemotron3_22b_64k"
 
 
 @run.cli.factory(name=NAME)
 def model() -> run.Config[pl.LightningModule]:
     """
-    Factory function to create a Nemotron4 22b model with 64k sequence length.
+    Factory function to create a Nemotron3 22B model with 64k sequence length.
 
     Returns:
-        run.Config[pl.LightningModule]: Configuration for the Nemotron4 22b and 64k sequence length model.
+        run.Config[pl.LightningModule]: Configuration for the Nemotron3 22b and 64k sequence length model.
 
     Examples:
         CLI usage:
-            $ nemo llm pretrain model=nemotron4_22b_64k ...
+            $ nemo llm pretrain model=nemotron3_22b_64k ...
 
         Python API usage:
             >>> model_config = model()
@@ -83,7 +83,7 @@ def pretrain_recipe(
     fn=pretrain,
 ) -> run.Partial:
     """
-    Create a pre-training recipe for Nemotron4 22b model with 16k sequence length.
+    Create a pre-training recipe for Nemotron3 22B model with 16k sequence length.
 
     This function sets up a complete configuration for pre-training, including
     model, trainer, data, logging, optimization, and resumption settings.
@@ -121,8 +121,8 @@
     Examples:
         CLI usage:
-            $ nemo llm pretrain --factory nemotron4_22b_64k
-            $ nemo llm pretrain --factory "nemotron4_22b_64k(num_nodes=2, name='my_nemotron_pretrain')"
+            $ nemo llm pretrain --factory nemotron3_22b_64k
+            $ nemo llm pretrain --factory "nemotron3_22b_64k(num_nodes=2, name='my_nemotron_pretrain')"
 
         Python API usage:
             >>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=2)