From 91be2cf5abe111bc4cd4ecff89881855739cbe1b Mon Sep 17 00:00:00 2001
From: Chen Cui
Date: Sat, 6 Jul 2024 00:46:52 -0400
Subject: [PATCH] fix pretraining data sizes and weights (#9627)

Signed-off-by: Chen Cui
Signed-off-by: Tugrul Konuk
---
 nemo/collections/llm/gpt/data/pre_training.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/nemo/collections/llm/gpt/data/pre_training.py b/nemo/collections/llm/gpt/data/pre_training.py
index 247ee1a1521a..46b407410d31 100644
--- a/nemo/collections/llm/gpt/data/pre_training.py
+++ b/nemo/collections/llm/gpt/data/pre_training.py
@@ -42,6 +42,9 @@ def __init__(
             paths = [paths]
         if weights is not None:
             assert len(weights) == len(paths)
+            if len(weights) == 1:
+                # weights must be None if there is only one dataset
+                weights = None
 
         self.paths = paths
         self.weights = weights
@@ -90,7 +93,7 @@ def setup(self, stage: str = "") -> None:
 
         if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float):
            # This is to make sure we only have one epoch on every validation iteration
-            num_val_samples = None
+            num_val_samples = None if self.weights is None else 1
 
         train_valid_test_num_samples = [num_train_samples, num_val_samples, num_test_samples]
         self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder(
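
Not part of the patch itself: below is a minimal, standalone sketch of the normalization that the new __init__ lines perform. The surrounding isinstance check is assumed from the patch context, and the example path is hypothetical; the real module passes these values on to BlendedMegatronDatasetBuilder rather than returning them.

    from typing import Optional, Sequence, Union

    def normalize_blend(paths: Union[str, Sequence[str]],
                        weights: Optional[Sequence[float]] = None):
        """Mirror of the patched __init__ logic: a lone dataset needs no blend weights."""
        if not isinstance(paths, (list, tuple)):
            paths = [paths]
        if weights is not None:
            assert len(weights) == len(paths)
            if len(weights) == 1:
                # weights must be None if there is only one dataset
                weights = None
        return paths, weights

    # Hypothetical usage: a single dataset given with an explicit weight of 1.0
    # comes back with weights=None.
    print(normalize_blend("/data/my_corpus_text_document", [1.0]))
    # (['/data/my_corpus_text_document'], None)

The second hunk depends on this normalization: when self.weights is None, num_val_samples is left as None so the validation split covers exactly one epoch, matching the intent of the existing comment in setup().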