From 132451849572b45a51fc981f95a59551a607f83c Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 5 Jun 2024 07:13:51 -0700 Subject: [PATCH 01/12] add validation_drop_last and add_extra_token params support for mcore ds Signed-off-by: dimapihtar --- .../nlp/language_modeling/conf/megatron_gpt_config.yaml | 7 ++++--- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index ca0c3f74e4c8..004cb540541a 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -56,7 +56,7 @@ exp_manager: model: # use GPTModel from megatron.core - mcore_gpt: False + mcore_gpt: True # specify micro_batch_size, global_batch_size, and model parallelism # gradient accumulation will be done automatically based on data_parallel_size @@ -240,9 +240,9 @@ model: # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} # Or see example below: # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" - data_prefix: ??? + data_prefix: [] index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: mmap + data_impl: mock mmap_bin_files: True splits_string: 900,50,50 seq_length: ${model.encoder_seq_length} @@ -253,6 +253,7 @@ model: reset_attention_mask: False # Reset attention mask after end-of-document token eod_mask_loss: False # Mask loss for the end of document tokens validation_drop_last: True # Set to false if the last partial validation samples is to be consumed + add_extra_token: True # Option to draw sequences with one extra token to ensure the sample input tokens and sample output tokens are both of the desired sequence length no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size shuffle_documents: True # Set to False to disable documents shuffling. 
Sample index will still be shuffled diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index a5b4450c7b44..b1b732b128d6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1497,6 +1497,8 @@ def build_train_valid_test_datasets(self): "reset_attention_mask": self.reset_attention_mask, "eod_mask_loss": self.eod_mask_loss, "mmap_bin_files": self.cfg.data.get("mmap_bin_files", True), + "drop_last_partial_validation_sequence": self.cfg.data.get("validation_drop_last", False), + "add_extra_token_to_sequence": self.cfg.data.get("add_extra_token", False), } data_prefix = self.cfg.data.data_prefix From 47f657656cab0d54612b2fe81ff119dd97fd24d1 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 6 Jun 2024 05:10:13 -0700 Subject: [PATCH 02/12] pad samples with dummy tokens only Signed-off-by: dimapihtar --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 4 ++-- .../nlp/data/language_modeling/megatron/data_samplers.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 004cb540541a..b9610dc89dc4 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -252,10 +252,10 @@ model: reset_position_ids: False # Reset position ids after end-of-document token reset_attention_mask: False # Reset attention mask after end-of-document token eod_mask_loss: False # Mask loss for the end of document tokens - validation_drop_last: True # Set to false if the last partial validation samples is to be consumed + validation_drop_last: False # Set to false if the last partial validation samples is to be consumed add_extra_token: True # Option to draw sequences with one extra token to ensure the sample input tokens and sample output tokens are both of the desired sequence length no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token - pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size + pad_samples_to_global_batch_size: True # Set to True if you want to pad the last partial batch with -1's to equal global batch size shuffle_documents: True # Set to False to disable documents shuffling. 
Sample index will still be shuffled exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem diff --git a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py index 6818f99d0e4f..78a021f51921 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py @@ -107,7 +107,7 @@ def __iter__(self): indices = range(self.consumed_samples, self.total_samples) if (not self.drop_last) and self.pad_samples_to_global_batch_size: pad_samples_num = -len(indices) % self.global_batch_size - pad_indices = range(-1, -pad_samples_num - 1, -1) + pad_indices = [None for _ in range(pad_samples_num)] indices = chain(indices, pad_indices) for idx in indices: From f042febaf887536bf7d02f700a33b538dda54cf6 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 6 Jun 2024 12:11:33 +0000 Subject: [PATCH 03/12] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../nlp/data/language_modeling/megatron/data_samplers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py index 78a021f51921..42a28ab8f9fb 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py @@ -91,8 +91,7 @@ def __len__(self): return (num_available_samples - 1) // self.micro_batch_times_data_parallel_size + 1 @abc.abstractmethod - def __iter__(self): - ... + def __iter__(self): ... class MegatronPretrainingSampler(BaseMegatronSampler): From 0feefafd10c5089718e6259ba57e2306a5b0be41 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 6 Jun 2024 06:32:20 -0700 Subject: [PATCH 04/12] use no_seqlen_plus_one_input_tokens as mcore's add_extra_token Signed-off-by: dimapihtar --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 1 - .../nlp/models/language_modeling/megatron_gpt_model.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index b9610dc89dc4..3e3bc44f1ee5 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -253,7 +253,6 @@ model: reset_attention_mask: False # Reset attention mask after end-of-document token eod_mask_loss: False # Mask loss for the end of document tokens validation_drop_last: False # Set to false if the last partial validation samples is to be consumed - add_extra_token: True # Option to draw sequences with one extra token to ensure the sample input tokens and sample output tokens are both of the desired sequence length no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token pad_samples_to_global_batch_size: True # Set to True if you want to pad the last partial batch with -1's to equal global batch size shuffle_documents: True # Set to False to disable documents shuffling. 
Sample index will still be shuffled diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index b1b732b128d6..e8358e96f555 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1488,6 +1488,7 @@ def build_train_valid_test_datasets(self): is_dataset_built_on_rank = lambda: True mock_dataset = True if self.cfg.data.get("data_impl", "mmap") == "mock" else False + add_extra_token = False if self.cfg.data.get("no_seqlen_plus_one_input_tokens", False) else True kwargs = { "random_seed": self.cfg.seed, "sequence_length": self.cfg.data.seq_length, @@ -1498,7 +1499,7 @@ def build_train_valid_test_datasets(self): "eod_mask_loss": self.eod_mask_loss, "mmap_bin_files": self.cfg.data.get("mmap_bin_files", True), "drop_last_partial_validation_sequence": self.cfg.data.get("validation_drop_last", False), - "add_extra_token_to_sequence": self.cfg.data.get("add_extra_token", False), + "add_extra_token_to_sequence": add_extra_token, } data_prefix = self.cfg.data.data_prefix From f78f0b2dc33c49384b7d3e16920577d6fbc0ef51 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 6 Jun 2024 06:35:48 -0700 Subject: [PATCH 05/12] revert config Signed-off-by: dimapihtar --- .../language_modeling/conf/megatron_gpt_config.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 3e3bc44f1ee5..ff7975af0a0f 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -16,7 +16,7 @@ trainer: max_epochs: -1 # PTL default. In practice, max_steps will be reached first. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 - val_check_interval: 100 + val_check_interval: 2000 limit_val_batches: 50 limit_test_batches: 500 accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models @@ -56,7 +56,7 @@ exp_manager: model: # use GPTModel from megatron.core - mcore_gpt: True + mcore_gpt: False # specify micro_batch_size, global_batch_size, and model parallelism # gradient accumulation will be done automatically based on data_parallel_size @@ -240,9 +240,9 @@ model: # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} # Or see example below: # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" - data_prefix: [] + data_prefix: ??? 
index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: mock + data_impl: mmap mmap_bin_files: True splits_string: 900,50,50 seq_length: ${model.encoder_seq_length} @@ -252,9 +252,9 @@ model: reset_position_ids: False # Reset position ids after end-of-document token reset_attention_mask: False # Reset attention mask after end-of-document token eod_mask_loss: False # Mask loss for the end of document tokens - validation_drop_last: False # Set to false if the last partial validation samples is to be consumed + validation_drop_last: True # Set to false if the last partial validation samples is to be consumed no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token - pad_samples_to_global_batch_size: True # Set to True if you want to pad the last partial batch with -1's to equal global batch size + pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size shuffle_documents: True # Set to False to disable documents shuffling. Sample index will still be shuffled exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem From cd6f91076bcc93cf7dbe4c9e85a8e557e3486a17 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 6 Jun 2024 06:39:12 -0700 Subject: [PATCH 06/12] revert config Signed-off-by: dimapihtar --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index ff7975af0a0f..ca0c3f74e4c8 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -16,7 +16,7 @@ trainer: max_epochs: -1 # PTL default. In practice, max_steps will be reached first. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 - val_check_interval: 2000 + val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models From 5a355bec6bf9c6a157228e644642d9205e5ef562 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 10 Jun 2024 09:42:24 -0700 Subject: [PATCH 07/12] set train_valid_test_num_samples[1] to None Signed-off-by: dimapihtar --- .../language_modeling/conf/megatron_gpt_config.yaml | 12 ++++++------ .../models/language_modeling/megatron_gpt_model.py | 3 +-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index ca0c3f74e4c8..942d83477f03 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -14,10 +14,10 @@ trainer: enable_checkpointing: False use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
- max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: 100 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 - val_check_interval: 100 - limit_val_batches: 50 + val_check_interval: 20 + limit_val_batches: 1.0 limit_test_batches: 500 accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models gradient_clip_val: 1.0 @@ -56,7 +56,7 @@ exp_manager: model: # use GPTModel from megatron.core - mcore_gpt: False + mcore_gpt: True # specify micro_batch_size, global_batch_size, and model parallelism # gradient accumulation will be done automatically based on data_parallel_size @@ -240,11 +240,11 @@ model: # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} # Or see example below: # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" - data_prefix: ??? + data_prefix: {train:[1.0,/home/data/test_text_document], validation:[/home/data/test_text_document], test:[/home/data/test_text_document]} index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix data_impl: mmap mmap_bin_files: True - splits_string: 900,50,50 + splits_string: null seq_length: ${model.encoder_seq_length} skip_warmup: True num_workers: 2 diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index b1a2e15df681..996e15e52285 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1472,8 +1472,7 @@ def build_train_valid_test_datasets(self): # Where N_d is the total number of samples in a dataset (files), and N is the requested number of samples (provided for every split in the list below). 
# Setting N = 1 we force E to be 1 as well if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float): - train_valid_test_num_samples[1] = 1 - + train_valid_test_num_samples[1] = None # Add extra FIM tokens to tokenizer if self.cfg.data.get('add_fim', False) and self.cfg.tokenizer.library == 'megatron': fim_tokens = self.cfg.data.fim.extra_tokens From 0dee6e3e2b5b5adb45077e8104b2c177ebd16ace Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 10 Jun 2024 09:48:09 -0700 Subject: [PATCH 08/12] add test case when validation_drop_last is False Signed-off-by: dimapihtar --- .github/workflows/cicd-main.yml | 2 ++ .../nlp/language_modeling/conf/megatron_gpt_config.yaml | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 12b8cdcb8eed..c000945c13b9 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -3122,6 +3122,7 @@ jobs: model.activations_checkpoint_method=block \ model.activations_checkpoint_granularity=full \ model.activations_checkpoint_num_layers=1 \ + model.data.validation_drop_last=False \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings @@ -3158,6 +3159,7 @@ jobs: model.activations_checkpoint_method=block \ model.activations_checkpoint_granularity=full \ model.activations_checkpoint_num_layers=1 \ + model.data.validation_drop_last=False \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings AFTER_SCRIPT: | diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 942d83477f03..1e1a6f7102b4 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -17,7 +17,7 @@ trainer: max_steps: 100 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 20 - limit_val_batches: 1.0 + limit_val_batches: 50 limit_test_batches: 500 accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models gradient_clip_val: 1.0 @@ -240,11 +240,11 @@ model: # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} # Or see example below: # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" - data_prefix: {train:[1.0,/home/data/test_text_document], validation:[/home/data/test_text_document], test:[/home/data/test_text_document]} + data_prefix: [1.0,/home/data/test_text_document] index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix data_impl: mmap mmap_bin_files: True - splits_string: null + splits_string: 900,50,50 seq_length: ${model.encoder_seq_length} skip_warmup: True num_workers: 2 From eaa7d17671633adacd204a5f19260a27f39375e5 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 10 Jun 2024 09:48:59 -0700 Subject: [PATCH 09/12] revert 
config Signed-off-by: dimapihtar --- .../nlp/language_modeling/conf/megatron_gpt_config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 1e1a6f7102b4..ca0c3f74e4c8 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -14,9 +14,9 @@ trainer: enable_checkpointing: False use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. - max_steps: 100 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 - val_check_interval: 20 + val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models @@ -56,7 +56,7 @@ exp_manager: model: # use GPTModel from megatron.core - mcore_gpt: True + mcore_gpt: False # specify micro_batch_size, global_batch_size, and model parallelism # gradient accumulation will be done automatically based on data_parallel_size @@ -240,7 +240,7 @@ model: # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} # Or see example below: # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" - data_prefix: [1.0,/home/data/test_text_document] + data_prefix: ??? index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix data_impl: mmap mmap_bin_files: True From 10cc59a2373f9fa49067439c52c2d1739bcfdf1a Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 10 Jun 2024 09:50:49 -0700 Subject: [PATCH 10/12] set validation_drop_last as True by default Signed-off-by: dimapihtar --- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 996e15e52285..a72029c812df 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1508,7 +1508,7 @@ def build_train_valid_test_datasets(self): "eod_mask_loss": self.eod_mask_loss, "create_attention_mask": not self.get_attention_mask_from_fusion, "mmap_bin_files": self.cfg.data.get("mmap_bin_files", True), - "drop_last_partial_validation_sequence": self.cfg.data.get("validation_drop_last", False), + "drop_last_partial_validation_sequence": self.cfg.data.get("validation_drop_last", True), "add_extra_token_to_sequence": add_extra_token, } From 789624224111f2dcc7764c5c9936e8e207c35a1a Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Tue, 11 Jun 2024 15:37:38 +0300 Subject: [PATCH 11/12] Update nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py Co-authored-by: jbaczek <45043825+jbaczek@users.noreply.github.com> Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> --- .../nlp/data/language_modeling/megatron/data_samplers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py index 42a28ab8f9fb..4a8b989a7b6d 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py @@ -106,7 +106,7 @@ def __iter__(self): indices = range(self.consumed_samples, self.total_samples) if (not self.drop_last) and self.pad_samples_to_global_batch_size: pad_samples_num = -len(indices) % self.global_batch_size - pad_indices = [None for _ in range(pad_samples_num)] + pad_indices = [None] * pad_samples_num indices = chain(indices, pad_indices) for idx in indices: From 54bfda3db110508839c6508159ff1de6da84ab67 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Tue, 11 Jun 2024 16:07:35 +0300 Subject: [PATCH 12/12] Update nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py Co-authored-by: jbaczek <45043825+jbaczek@users.noreply.github.com> Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> --- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index a72029c812df..8cb8d95150c9 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1497,7 +1497,7 @@ def build_train_valid_test_datasets(self): is_dataset_built_on_rank = lambda: True mock_dataset = True if self.cfg.data.get("data_impl", "mmap") == "mock" else False - add_extra_token = False if self.cfg.data.get("no_seqlen_plus_one_input_tokens", False) else True + add_extra_token = not self.cfg.data.get("no_seqlen_plus_one_input_tokens", False) kwargs = { "random_seed": self.cfg.seed, "sequence_length": self.cfg.data.seq_length,
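
Taken together, the data-sampler and dataset-kwargs changes in this series do two things: the pretraining sampler now pads the last partial global batch with `None` placeholders (dummy samples) rather than `-1` indices, and the existing NeMo options `validation_drop_last` and `no_seqlen_plus_one_input_tokens` are forwarded to megatron-core as `drop_last_partial_validation_sequence` and `add_extra_token_to_sequence`, with `validation_drop_last` defaulting to True. The sketch below is a minimal, self-contained illustration of that logic; `iter_padded_indices` and `mcore_dataset_kwargs` are simplified stand-ins written for this note, not the actual NeMo or megatron-core APIs.

```python
from itertools import chain


def iter_padded_indices(consumed_samples, total_samples, global_batch_size,
                        drop_last=False, pad_samples_to_global_batch_size=True):
    """Toy version of the sampler behaviour: when the last global batch is
    partial and padding is requested, append None placeholders instead of
    negative indices."""
    indices = range(consumed_samples, total_samples)
    if (not drop_last) and pad_samples_to_global_batch_size:
        pad_samples_num = -len(indices) % global_batch_size
        pad_indices = [None] * pad_samples_num  # dummy samples, no index -1
        indices = chain(indices, pad_indices)
    return list(indices)


def mcore_dataset_kwargs(data_cfg):
    """Toy version of the config-to-kwargs mapping added in this series."""
    return {
        # validation batches drop the last partial sequence by default
        "drop_last_partial_validation_sequence": data_cfg.get("validation_drop_last", True),
        # mcore's positive flag is the negation of NeMo's "no_..." flag
        "add_extra_token_to_sequence": not data_cfg.get("no_seqlen_plus_one_input_tokens", False),
    }


if __name__ == "__main__":
    # 10 remaining samples with a global batch size of 4 -> two None pads
    print(iter_padded_indices(consumed_samples=0, total_samples=10, global_batch_size=4))
    # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, None, None]

    print(mcore_dataset_kwargs({"validation_drop_last": False}))
    # {'drop_last_partial_validation_sequence': False, 'add_extra_token_to_sequence': True}
```

With these defaults, leaving `no_seqlen_plus_one_input_tokens` unset still yields `add_extra_token_to_sequence=True`, i.e. samples are drawn with sequence length + 1 tokens unless the option is explicitly enabled.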