add support for new mcore ds features (NVIDIA#9388)
* add validation_drop_last and add_extra_token params support for mcore ds

Signed-off-by: dimapihtar <[email protected]>

* pad samples with dummy tokens only

Signed-off-by: dimapihtar <[email protected]>

* Apply isort and black reformatting

Signed-off-by: dimapihtar <[email protected]>

* use no_seqlen_plus_one_input_tokens as mcore's add_extra_token

Signed-off-by: dimapihtar <[email protected]>

* revert config

Signed-off-by: dimapihtar <[email protected]>

* revert config

Signed-off-by: dimapihtar <[email protected]>

* set train_valid_test_num_samples[1] to None

Signed-off-by: dimapihtar <[email protected]>

* add test case when validation_drop_last is False

Signed-off-by: dimapihtar <[email protected]>

* revert config

Signed-off-by: dimapihtar <[email protected]>

* set validation_drop_last as True by default

Signed-off-by: dimapihtar <[email protected]>

* Update nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py

Co-authored-by: jbaczek <[email protected]>
Signed-off-by: Dmytro Pykhtar <[email protected]>

* Update nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py

Co-authored-by: jbaczek <[email protected]>
Signed-off-by: Dmytro Pykhtar <[email protected]>

---------

Signed-off-by: dimapihtar <[email protected]>
Signed-off-by: dimapihtar <[email protected]>
Signed-off-by: Dmytro Pykhtar <[email protected]>
Co-authored-by: dimapihtar <[email protected]>
Co-authored-by: jbaczek <[email protected]>
3 people authored Jun 11, 2024
1 parent 551f911 commit b84dfa0
Showing 3 changed files with 8 additions and 5 deletions.
.github/workflows/cicd-main.yml (2 additions, 0 deletions)

@@ -2398,6 +2398,7 @@ jobs:
 model.activations_checkpoint_method=block \
 model.activations_checkpoint_granularity=full \
 model.activations_checkpoint_num_layers=1 \
+model.data.validation_drop_last=False \
 model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
 model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
@@ -2432,6 +2433,7 @@ jobs:
 model.activations_checkpoint_method=block \
 model.activations_checkpoint_granularity=full \
 model.activations_checkpoint_num_layers=1 \
+model.data.validation_drop_last=False \
 model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
 model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
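Both CI jobs above now pass model.data.validation_drop_last=False, exercising the new path in which the trailing partial validation batch is padded and kept rather than discarded. A minimal sketch of the batch-count arithmetic this changes, with hypothetical numbers not taken from the test data:

# Illustrative only: how keeping the last partial batch changes the
# number of validation batches for a hypothetical dataset size.
num_samples = 1000
global_batch_size = 96
batches_if_dropped = num_samples // global_batch_size         # 10
batches_if_kept = (num_samples - 1) // global_batch_size + 1  # 11

The second line uses the same ceiling-division pattern as the sampler's __len__ shown in the next diff.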
nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py (2 additions, 3 deletions)

@@ -91,8 +91,7 @@ def __len__(self):
         return (num_available_samples - 1) // self.micro_batch_times_data_parallel_size + 1

     @abc.abstractmethod
-    def __iter__(self):
-        ...
+    def __iter__(self): ...


 class MegatronPretrainingSampler(BaseMegatronSampler):
@@ -107,7 +106,7 @@ def __iter__(self):
         indices = range(self.consumed_samples, self.total_samples)
         if (not self.drop_last) and self.pad_samples_to_global_batch_size:
             pad_samples_num = -len(indices) % self.global_batch_size
-            pad_indices = range(-1, -pad_samples_num - 1, -1)
+            pad_indices = [None] * pad_samples_num
             indices = chain(indices, pad_indices)

         for idx in indices:
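The second hunk implements the "pad samples with dummy tokens only" bullet from the commit message: the old negative pad indices (-1, -2, ...) index real samples from the end of the dataset, so the final validation batch repeated data, while None entries can be mapped to dummy, all-padding samples downstream. A minimal sketch of the idea, with make_dummy_sample as a hypothetical stand-in for the dataset-side handling:

from itertools import chain

def padded_indices(consumed_samples, total_samples, global_batch_size):
    """Yield dataset indices, filling the last global batch with None."""
    indices = range(consumed_samples, total_samples)
    # -len(indices) % gbs == number of empty slots in the final global batch.
    pad_samples_num = -len(indices) % global_batch_size
    return chain(indices, [None] * pad_samples_num)

def fetch(dataset, idx, make_dummy_sample):
    # A None index yields a dummy sample instead of re-reading a real one,
    # so no document is counted twice during validation.
    return make_dummy_sample() if idx is None else dataset[idx]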
nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py (4 additions, 2 deletions)

@@ -1472,8 +1472,7 @@ def build_train_valid_test_datasets(self):
         # Where N_d is the total number of samples in a dataset (files), and N is the requested number of samples (provided for every split in the list below).
         # Setting N = 1 we force E to be 1 as well
         if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float):
-            train_valid_test_num_samples[1] = 1
-
+            train_valid_test_num_samples[1] = None
         # Add extra FIM tokens to tokenizer
         if self.cfg.data.get('add_fim', False) and self.cfg.tokenizer.library == 'megatron':
             fim_tokens = self.cfg.data.fim.extra_tokens
@@ -1498,6 +1497,7 @@ def build_train_valid_test_datasets(self):
             is_dataset_built_on_rank = lambda: True

         mock_dataset = True if self.cfg.data.get("data_impl", "mmap") == "mock" else False
+        add_extra_token = not self.cfg.data.get("no_seqlen_plus_one_input_tokens", False)
         kwargs = {
             "random_seed": self.cfg.seed,
             "sequence_length": self.cfg.data.seq_length,
@@ -1508,6 +1508,8 @@ def build_train_valid_test_datasets(self):
             "eod_mask_loss": self.eod_mask_loss,
             "create_attention_mask": not self.get_attention_mask_from_fusion,
             "mmap_bin_files": self.cfg.data.get("mmap_bin_files", True),
+            "drop_last_partial_validation_sequence": self.cfg.data.get("validation_drop_last", True),
+            "add_extra_token_to_sequence": add_extra_token,
         }

         data_prefix = self.cfg.data.data_prefix
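On the model side, the user-facing no_seqlen_plus_one_input_tokens flag is negated and handed to mcore as add_extra_token_to_sequence, and validation_drop_last maps to drop_last_partial_validation_sequence. GPT pretraining datasets conventionally return seq_length + 1 tokens so that inputs and labels come from a one-position shift; a minimal sketch of the difference, assuming the usual causal-LM shift and using -100 as an illustrative ignore index:

import torch

def split_inputs_labels(tokens: torch.Tensor, add_extra_token: bool):
    """Sketch of how the extra token affects input/label construction."""
    if add_extra_token:
        # Dataset returned seq_length + 1 tokens: shift by one, keeping
        # seq_length positions in both inputs and labels.
        return tokens[:-1], tokens[1:]
    # Dataset returned exactly seq_length tokens: the last position has no
    # next-token target, so its label must be masked out of the loss.
    labels = torch.cat([tokens[1:], tokens.new_full((1,), -100)])
    return tokens, labels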
