diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 01a8cfc4b0df..6cf60271e0d7 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -2398,6 +2398,7 @@ jobs:
           model.activations_checkpoint_method=block \
           model.activations_checkpoint_granularity=full \
           model.activations_checkpoint_num_layers=1 \
+          model.data.validation_drop_last=False \
           model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
           model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
@@ -2432,6 +2433,7 @@ jobs:
           model.activations_checkpoint_method=block \
           model.activations_checkpoint_granularity=full \
           model.activations_checkpoint_num_layers=1 \
+          model.data.validation_drop_last=False \
           model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
           model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
diff --git a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py
index 6818f99d0e4f..4a8b989a7b6d 100644
--- a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py
@@ -91,8 +91,7 @@ def __len__(self):
         return (num_available_samples - 1) // self.micro_batch_times_data_parallel_size + 1

     @abc.abstractmethod
-    def __iter__(self):
-        ...
+    def __iter__(self): ...


 class MegatronPretrainingSampler(BaseMegatronSampler):
@@ -107,7 +106,7 @@ def __iter__(self):
         indices = range(self.consumed_samples, self.total_samples)
         if (not self.drop_last) and self.pad_samples_to_global_batch_size:
             pad_samples_num = -len(indices) % self.global_batch_size
-            pad_indices = range(-1, -pad_samples_num - 1, -1)
+            pad_indices = [None] * pad_samples_num
             indices = chain(indices, pad_indices)

         for idx in indices:
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 718991dc203d..8cb8d95150c9 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -1472,8 +1472,7 @@ def build_train_valid_test_datasets(self):
         # Where N_d is the total number of samples in a dataset (files), and N is the requested number of samples (provided for every split in the list below).
         # Setting N = 1 we force E to be 1 as well
         if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float):
-            train_valid_test_num_samples[1] = 1
-
+            train_valid_test_num_samples[1] = None
         # Add extra FIM tokens to tokenizer
         if self.cfg.data.get('add_fim', False) and self.cfg.tokenizer.library == 'megatron':
             fim_tokens = self.cfg.data.fim.extra_tokens
@@ -1498,6 +1497,7 @@ def build_train_valid_test_datasets(self):
             is_dataset_built_on_rank = lambda: True

         mock_dataset = True if self.cfg.data.get("data_impl", "mmap") == "mock" else False
+        add_extra_token = not self.cfg.data.get("no_seqlen_plus_one_input_tokens", False)
         kwargs = {
             "random_seed": self.cfg.seed,
             "sequence_length": self.cfg.data.seq_length,
@@ -1508,6 +1508,8 @@ def build_train_valid_test_datasets(self):
             "eod_mask_loss": self.eod_mask_loss,
             "create_attention_mask": not self.get_attention_mask_from_fusion,
             "mmap_bin_files": self.cfg.data.get("mmap_bin_files", True),
+            "drop_last_partial_validation_sequence": self.cfg.data.get("validation_drop_last", True),
+            "add_extra_token_to_sequence": add_extra_token,
         }

         data_prefix = self.cfg.data.data_prefix
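A minimal standalone sketch of the new padding behavior in MegatronPretrainingSampler.__iter__ (the sample counts below are invented for illustration; this is not the NeMo class itself). The old pad values from range(-1, -pad_samples_num - 1, -1) are themselves valid Python indices, so a consumer that forgot to special-case them would silently fetch real samples from the end of the dataset; None is an unambiguous pad sentinel:

    from itertools import chain

    # Illustrative stand-ins for the sampler's attributes.
    consumed_samples, total_samples = 0, 10
    global_batch_size = 4

    indices = range(consumed_samples, total_samples)
    # Number of pad slots needed to round the epoch up to a full global batch.
    pad_samples_num = -len(indices) % global_batch_size
    pad_indices = [None] * pad_samples_num  # was: range(-1, -pad_samples_num - 1, -1)
    print(list(chain(indices, pad_indices)))
    # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, None, None]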
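And a hedged sketch of the config plumbing added in build_train_valid_test_datasets, with a plain dict standing in for the OmegaConf cfg.data node and values chosen to mirror the CI change above. model.data.validation_drop_last feeds the Megatron-Core dataset kwarg drop_last_partial_validation_sequence; no_seqlen_plus_one_input_tokens is inverted into add_extra_token_to_sequence, which, when True, fetches sequence_length + 1 tokens per sample so inputs and labels can be built by a one-token shift:

    # Stand-in for self.cfg.data after model.data.validation_drop_last=False is applied.
    cfg_data = {"validation_drop_last": False}

    add_extra_token = not cfg_data.get("no_seqlen_plus_one_input_tokens", False)
    kwargs = {
        # Keep (rather than drop) the trailing partial validation sequence.
        "drop_last_partial_validation_sequence": cfg_data.get("validation_drop_last", True),
        # True: each sample holds seq_length + 1 tokens for the shifted input/label split.
        "add_extra_token_to_sequence": add_extra_token,
    }
    assert kwargs == {
        "drop_last_partial_validation_sequence": False,
        "add_extra_token_to_sequence": True,
    }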