Skip to content

Commit

Permalink
Add support for new Megatron-Core (mcore) dataset features (#9388)
Browse files Browse the repository at this point in the history
* add support for the validation_drop_last and add_extra_token params in mcore datasets

Signed-off-by: dimapihtar <[email protected]>

* pad samples with dummy tokens only

Signed-off-by: dimapihtar <[email protected]>

* Apply isort and black reformatting

Signed-off-by: dimapihtar <[email protected]>

* use no_seqlen_plus_one_input_tokens as mcore's add_extra_token

Signed-off-by: dimapihtar <[email protected]>

* revert config

Signed-off-by: dimapihtar <[email protected]>

* revert config

Signed-off-by: dimapihtar <[email protected]>

* set train_valid_test_num_samples[1] to None

Signed-off-by: dimapihtar <[email protected]>

* add test case when validation_drop_last is False

Signed-off-by: dimapihtar <[email protected]>

* revert config

Signed-off-by: dimapihtar <[email protected]>

* set validation_drop_last as True by default

Signed-off-by: dimapihtar <[email protected]>

* Update nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py

Co-authored-by: jbaczek <[email protected]>
Signed-off-by: Dmytro Pykhtar <[email protected]>

* Update nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py

Co-authored-by: jbaczek <[email protected]>
Signed-off-by: Dmytro Pykhtar <[email protected]>

---------

Signed-off-by: dimapihtar <[email protected]>
Signed-off-by: dimapihtar <[email protected]>
Signed-off-by: Dmytro Pykhtar <[email protected]>
Co-authored-by: dimapihtar <[email protected]>
Co-authored-by: jbaczek <[email protected]>
Signed-off-by: Jan Lasek <[email protected]>
  • Loading branch information
3 people authored and janekl committed Jun 12, 2024
1 parent 259dff9 commit 23d9a41
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 5 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2398,6 +2398,7 @@ jobs:
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.validation_drop_last=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
Expand Down Expand Up @@ -2432,6 +2433,7 @@ jobs:
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.validation_drop_last=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,7 @@ def __len__(self):
return (num_available_samples - 1) // self.micro_batch_times_data_parallel_size + 1

@abc.abstractmethod
def __iter__(self):
...
def __iter__(self): ...


class MegatronPretrainingSampler(BaseMegatronSampler):
Expand All @@ -107,7 +106,7 @@ def __iter__(self):
indices = range(self.consumed_samples, self.total_samples)
if (not self.drop_last) and self.pad_samples_to_global_batch_size:
pad_samples_num = -len(indices) % self.global_batch_size
pad_indices = range(-1, -pad_samples_num - 1, -1)
pad_indices = [None] * pad_samples_num
indices = chain(indices, pad_indices)

for idx in indices:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1472,8 +1472,7 @@ def build_train_valid_test_datasets(self):
# Where N_d is the total number of samples in a dataset (files), and N is the requested number of samples (provided for every split in the list below).
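# By setting N = 1, we force E to be 1 as well.
# NOTE(review): the updated assignment below uses None instead of 1 — confirm this comment still describes the intended behavior.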
if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float):
train_valid_test_num_samples[1] = 1

train_valid_test_num_samples[1] = None
# Add extra FIM tokens to tokenizer
if self.cfg.data.get('add_fim', False) and self.cfg.tokenizer.library == 'megatron':
fim_tokens = self.cfg.data.fim.extra_tokens
Expand All @@ -1498,6 +1497,7 @@ def build_train_valid_test_datasets(self):
is_dataset_built_on_rank = lambda: True

mock_dataset = True if self.cfg.data.get("data_impl", "mmap") == "mock" else False
add_extra_token = not self.cfg.data.get("no_seqlen_plus_one_input_tokens", False)
kwargs = {
"random_seed": self.cfg.seed,
"sequence_length": self.cfg.data.seq_length,
Expand All @@ -1508,6 +1508,8 @@ def build_train_valid_test_datasets(self):
"eod_mask_loss": self.eod_mask_loss,
"create_attention_mask": not self.get_attention_mask_from_fusion,
"mmap_bin_files": self.cfg.data.get("mmap_bin_files", True),
"drop_last_partial_validation_sequence": self.cfg.data.get("validation_drop_last", True),
"add_extra_token_to_sequence": add_extra_token,
}

data_prefix = self.cfg.data.data_prefix
Expand Down

0 comments on commit 23d9a41

Please sign in to comment.