From 132451849572b45a51fc981f95a59551a607f83c Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 5 Jun 2024 07:13:51 -0700 Subject: [PATCH 01/12] add validation_drop_last and add_extra_token params support for mcore ds Signed-off-by: dimapihtar --- .../nlp/language_modeling/conf/megatron_gpt_config.yaml | 7 ++++--- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index ca0c3f74e4c8..004cb540541a 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -56,7 +56,7 @@ exp_manager: model: # use GPTModel from megatron.core - mcore_gpt: False + mcore_gpt: True # specify micro_batch_size, global_batch_size, and model parallelism # gradient accumulation will be done automatically based on data_parallel_size @@ -240,9 +240,9 @@ model: # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} # Or see example below: # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" - data_prefix: ??? + data_prefix: [] index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: mmap + data_impl: mock mmap_bin_files: True splits_string: 900,50,50 seq_length: ${model.encoder_seq_length} @@ -253,6 +253,7 @@ model: reset_attention_mask: False # Reset attention mask after end-of-document token eod_mask_loss: False # Mask loss for the end of document tokens validation_drop_last: True # Set to false if the last partial validation samples is to be consumed + add_extra_token: True # Option to draw sequences with one extra token to ensure the sample input tokens and sample output tokens are both of the desired sequence length no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size shuffle_documents: True # Set to False to disable documents shuffling. 
Sample index will still be shuffled diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index a5b4450c7b44..b1b732b128d6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1497,6 +1497,8 @@ def build_train_valid_test_datasets(self): "reset_attention_mask": self.reset_attention_mask, "eod_mask_loss": self.eod_mask_loss, "mmap_bin_files": self.cfg.data.get("mmap_bin_files", True), + "drop_last_partial_validation_sequence": self.cfg.data.get("validation_drop_last", False), + "add_extra_token_to_sequence": self.cfg.data.get("add_extra_token", False), } data_prefix = self.cfg.data.data_prefix From 47f657656cab0d54612b2fe81ff119dd97fd24d1 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 6 Jun 2024 05:10:13 -0700 Subject: [PATCH 02/12] pad samples with dummy tokens only Signed-off-by: dimapihtar --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 4 ++-- .../nlp/data/language_modeling/megatron/data_samplers.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 004cb540541a..b9610dc89dc4 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -252,10 +252,10 @@ model: reset_position_ids: False # Reset position ids after end-of-document token reset_attention_mask: False # Reset attention mask after end-of-document token eod_mask_loss: False # Mask loss for the end of document tokens - validation_drop_last: True # Set to false if the last partial validation samples is to be consumed + validation_drop_last: False # Set to false if the last partial validation samples is to be consumed add_extra_token: True # Option to draw sequences with one extra token to ensure the sample input tokens and sample output tokens are both of the desired sequence length no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token - pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size + pad_samples_to_global_batch_size: True # Set to True if you want to pad the last partial batch with -1's to equal global batch size shuffle_documents: True # Set to False to disable documents shuffling. 
Sample index will still be shuffled exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem diff --git a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py index 6818f99d0e4f..78a021f51921 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py @@ -107,7 +107,7 @@ def __iter__(self): indices = range(self.consumed_samples, self.total_samples) if (not self.drop_last) and self.pad_samples_to_global_batch_size: pad_samples_num = -len(indices) % self.global_batch_size - pad_indices = range(-1, -pad_samples_num - 1, -1) + pad_indices = [None for _ in range(pad_samples_num)] indices = chain(indices, pad_indices) for idx in indices: From f042febaf887536bf7d02f700a33b538dda54cf6 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 6 Jun 2024 12:11:33 +0000 Subject: [PATCH 03/12] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../nlp/data/language_modeling/megatron/data_samplers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py index 78a021f51921..42a28ab8f9fb 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py @@ -91,8 +91,7 @@ def __len__(self): return (num_available_samples - 1) // self.micro_batch_times_data_parallel_size + 1 @abc.abstractmethod - def __iter__(self): - ... + def __iter__(self): ... class MegatronPretrainingSampler(BaseMegatronSampler): From 0feefafd10c5089718e6259ba57e2306a5b0be41 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 6 Jun 2024 06:32:20 -0700 Subject: [PATCH 04/12] use no_seqlen_plus_one_input_tokens as mcore's add_extra_token Signed-off-by: dimapihtar --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 1 - .../nlp/models/language_modeling/megatron_gpt_model.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index b9610dc89dc4..3e3bc44f1ee5 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -253,7 +253,6 @@ model: reset_attention_mask: False # Reset attention mask after end-of-document token eod_mask_loss: False # Mask loss for the end of document tokens validation_drop_last: False # Set to false if the last partial validation samples is to be consumed - add_extra_token: True # Option to draw sequences with one extra token to ensure the sample input tokens and sample output tokens are both of the desired sequence length no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token pad_samples_to_global_batch_size: True # Set to True if you want to pad the last partial batch with -1's to equal global batch size shuffle_documents: True # Set to False to disable documents shuffling. 
Sample index will still be shuffled diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index b1b732b128d6..e8358e96f555 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1488,6 +1488,7 @@ def build_train_valid_test_datasets(self): is_dataset_built_on_rank = lambda: True mock_dataset = True if self.cfg.data.get("data_impl", "mmap") == "mock" else False + add_extra_token = False if self.cfg.data.get("no_seqlen_plus_one_input_tokens", False) else True kwargs = { "random_seed": self.cfg.seed, "sequence_length": self.cfg.data.seq_length, @@ -1498,7 +1499,7 @@ def build_train_valid_test_datasets(self): "eod_mask_loss": self.eod_mask_loss, "mmap_bin_files": self.cfg.data.get("mmap_bin_files", True), "drop_last_partial_validation_sequence": self.cfg.data.get("validation_drop_last", False), - "add_extra_token_to_sequence": self.cfg.data.get("add_extra_token", False), + "add_extra_token_to_sequence": add_extra_token, } data_prefix = self.cfg.data.data_prefix From f78f0b2dc33c49384b7d3e16920577d6fbc0ef51 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 6 Jun 2024 06:35:48 -0700 Subject: [PATCH 05/12] revert config Signed-off-by: dimapihtar --- .../language_modeling/conf/megatron_gpt_config.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 3e3bc44f1ee5..ff7975af0a0f 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -16,7 +16,7 @@ trainer: max_epochs: -1 # PTL default. In practice, max_steps will be reached first. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 - val_check_interval: 100 + val_check_interval: 2000 limit_val_batches: 50 limit_test_batches: 500 accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models @@ -56,7 +56,7 @@ exp_manager: model: # use GPTModel from megatron.core - mcore_gpt: True + mcore_gpt: False # specify micro_batch_size, global_batch_size, and model parallelism # gradient accumulation will be done automatically based on data_parallel_size @@ -240,9 +240,9 @@ model: # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} # Or see example below: # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" - data_prefix: [] + data_prefix: ??? 
index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: mock + data_impl: mmap mmap_bin_files: True splits_string: 900,50,50 seq_length: ${model.encoder_seq_length} @@ -252,9 +252,9 @@ model: reset_position_ids: False # Reset position ids after end-of-document token reset_attention_mask: False # Reset attention mask after end-of-document token eod_mask_loss: False # Mask loss for the end of document tokens - validation_drop_last: False # Set to false if the last partial validation samples is to be consumed + validation_drop_last: True # Set to false if the last partial validation samples is to be consumed no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token - pad_samples_to_global_batch_size: True # Set to True if you want to pad the last partial batch with -1's to equal global batch size + pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size shuffle_documents: True # Set to False to disable documents shuffling. Sample index will still be shuffled exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem From cd6f91076bcc93cf7dbe4c9e85a8e557e3486a17 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 6 Jun 2024 06:39:12 -0700 Subject: [PATCH 06/12] revert config Signed-off-by: dimapihtar --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index ff7975af0a0f..ca0c3f74e4c8 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -16,7 +16,7 @@ trainer: max_epochs: -1 # PTL default. In practice, max_steps will be reached first. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 - val_check_interval: 2000 + val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models From 5a355bec6bf9c6a157228e644642d9205e5ef562 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 10 Jun 2024 09:42:24 -0700 Subject: [PATCH 07/12] set train_valid_test_num_samples[1] to None Signed-off-by: dimapihtar --- .../language_modeling/conf/megatron_gpt_config.yaml | 12 ++++++------ .../models/language_modeling/megatron_gpt_model.py | 3 +-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index ca0c3f74e4c8..942d83477f03 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -14,10 +14,10 @@ trainer: enable_checkpointing: False use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
- max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: 100 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 - val_check_interval: 100 - limit_val_batches: 50 + val_check_interval: 20 + limit_val_batches: 1.0 limit_test_batches: 500 accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models gradient_clip_val: 1.0 @@ -56,7 +56,7 @@ exp_manager: model: # use GPTModel from megatron.core - mcore_gpt: False + mcore_gpt: True # specify micro_batch_size, global_batch_size, and model parallelism # gradient accumulation will be done automatically based on data_parallel_size @@ -240,11 +240,11 @@ model: # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} # Or see example below: # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" - data_prefix: ??? + data_prefix: {train:[1.0,/home/data/test_text_document], validation:[/home/data/test_text_document], test:[/home/data/test_text_document]} index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix data_impl: mmap mmap_bin_files: True - splits_string: 900,50,50 + splits_string: null seq_length: ${model.encoder_seq_length} skip_warmup: True num_workers: 2 diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index b1a2e15df681..996e15e52285 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1472,8 +1472,7 @@ def build_train_valid_test_datasets(self): # Where N_d is the total number of samples in a dataset (files), and N is the requested number of samples (provided for every split in the list below). 
# Setting N = 1 we force E to be 1 as well if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float): - train_valid_test_num_samples[1] = 1 - + train_valid_test_num_samples[1] = None # Add extra FIM tokens to tokenizer if self.cfg.data.get('add_fim', False) and self.cfg.tokenizer.library == 'megatron': fim_tokens = self.cfg.data.fim.extra_tokens From 0dee6e3e2b5b5adb45077e8104b2c177ebd16ace Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 10 Jun 2024 09:48:09 -0700 Subject: [PATCH 08/12] add test case when validation_drop_last is False Signed-off-by: dimapihtar --- .github/workflows/cicd-main.yml | 2 ++ .../nlp/language_modeling/conf/megatron_gpt_config.yaml | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 12b8cdcb8eed..c000945c13b9 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -3122,6 +3122,7 @@ jobs: model.activations_checkpoint_method=block \ model.activations_checkpoint_granularity=full \ model.activations_checkpoint_num_layers=1 \ + model.data.validation_drop_last=False \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings @@ -3158,6 +3159,7 @@ jobs: model.activations_checkpoint_method=block \ model.activations_checkpoint_granularity=full \ model.activations_checkpoint_num_layers=1 \ + model.data.validation_drop_last=False \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings AFTER_SCRIPT: | diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 942d83477f03..1e1a6f7102b4 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -17,7 +17,7 @@ trainer: max_steps: 100 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 20 - limit_val_batches: 1.0 + limit_val_batches: 50 limit_test_batches: 500 accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models gradient_clip_val: 1.0 @@ -240,11 +240,11 @@ model: # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} # Or see example below: # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" - data_prefix: {train:[1.0,/home/data/test_text_document], validation:[/home/data/test_text_document], test:[/home/data/test_text_document]} + data_prefix: [1.0,/home/data/test_text_document] index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix data_impl: mmap mmap_bin_files: True - splits_string: null + splits_string: 900,50,50 seq_length: ${model.encoder_seq_length} skip_warmup: True num_workers: 2 From eaa7d17671633adacd204a5f19260a27f39375e5 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 10 Jun 2024 09:48:59 -0700 Subject: [PATCH 09/12] revert 
config Signed-off-by: dimapihtar --- .../nlp/language_modeling/conf/megatron_gpt_config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 1e1a6f7102b4..ca0c3f74e4c8 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -14,9 +14,9 @@ trainer: enable_checkpointing: False use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. - max_steps: 100 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 - val_check_interval: 20 + val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models @@ -56,7 +56,7 @@ exp_manager: model: # use GPTModel from megatron.core - mcore_gpt: True + mcore_gpt: False # specify micro_batch_size, global_batch_size, and model parallelism # gradient accumulation will be done automatically based on data_parallel_size @@ -240,7 +240,7 @@ model: # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} # Or see example below: # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" - data_prefix: [1.0,/home/data/test_text_document] + data_prefix: ??? index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix data_impl: mmap mmap_bin_files: True From 10cc59a2373f9fa49067439c52c2d1739bcfdf1a Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 10 Jun 2024 09:50:49 -0700 Subject: [PATCH 10/12] set validation_drop_last as True by default Signed-off-by: dimapihtar --- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 996e15e52285..a72029c812df 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1508,7 +1508,7 @@ def build_train_valid_test_datasets(self): "eod_mask_loss": self.eod_mask_loss, "create_attention_mask": not self.get_attention_mask_from_fusion, "mmap_bin_files": self.cfg.data.get("mmap_bin_files", True), - "drop_last_partial_validation_sequence": self.cfg.data.get("validation_drop_last", False), + "drop_last_partial_validation_sequence": self.cfg.data.get("validation_drop_last", True), "add_extra_token_to_sequence": add_extra_token, } From 789624224111f2dcc7764c5c9936e8e207c35a1a Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Tue, 11 Jun 2024 15:37:38 +0300 Subject: [PATCH 11/12] Update nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py Co-authored-by: jbaczek <45043825+jbaczek@users.noreply.github.com> Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> --- .../nlp/data/language_modeling/megatron/data_samplers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py index 42a28ab8f9fb..4a8b989a7b6d 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py @@ -106,7 +106,7 @@ def __iter__(self): indices = range(self.consumed_samples, self.total_samples) if (not self.drop_last) and self.pad_samples_to_global_batch_size: pad_samples_num = -len(indices) % self.global_batch_size - pad_indices = [None for _ in range(pad_samples_num)] + pad_indices = [None] * pad_samples_num indices = chain(indices, pad_indices) for idx in indices: From 54bfda3db110508839c6508159ff1de6da84ab67 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Tue, 11 Jun 2024 16:07:35 +0300 Subject: [PATCH 12/12] Update nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py Co-authored-by: jbaczek <45043825+jbaczek@users.noreply.github.com> Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> --- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index a72029c812df..8cb8d95150c9 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1497,7 +1497,7 @@ def build_train_valid_test_datasets(self): is_dataset_built_on_rank = lambda: True mock_dataset = True if self.cfg.data.get("data_impl", "mmap") == "mock" else False - add_extra_token = False if self.cfg.data.get("no_seqlen_plus_one_input_tokens", False) else True + add_extra_token = not self.cfg.data.get("no_seqlen_plus_one_input_tokens", False) kwargs = { "random_seed": self.cfg.seed, "sequence_length": self.cfg.data.seq_length,
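
Taken together, the data-sampler and dataset-kwargs changes in this series do two things: the pretraining sampler now pads the last partial global batch with `None` placeholders (dummy samples) rather than `-1` indices, and the existing NeMo options `validation_drop_last` and `no_seqlen_plus_one_input_tokens` are forwarded to megatron-core as `drop_last_partial_validation_sequence` and `add_extra_token_to_sequence`, with `validation_drop_last` defaulting to True. The sketch below is a minimal, self-contained illustration of that logic; `iter_padded_indices` and `mcore_dataset_kwargs` are simplified stand-ins written for this note, not the actual NeMo or megatron-core APIs.

```python
from itertools import chain


def iter_padded_indices(consumed_samples, total_samples, global_batch_size,
                        drop_last=False, pad_samples_to_global_batch_size=True):
    """Toy version of the sampler behaviour: when the last global batch is
    partial and padding is requested, append None placeholders instead of
    negative indices."""
    indices = range(consumed_samples, total_samples)
    if (not drop_last) and pad_samples_to_global_batch_size:
        pad_samples_num = -len(indices) % global_batch_size
        pad_indices = [None] * pad_samples_num  # dummy samples, no index -1
        indices = chain(indices, pad_indices)
    return list(indices)


def mcore_dataset_kwargs(data_cfg):
    """Toy version of the config-to-kwargs mapping added in this series."""
    return {
        # validation batches drop the last partial sequence by default
        "drop_last_partial_validation_sequence": data_cfg.get("validation_drop_last", True),
        # mcore's positive flag is the negation of NeMo's "no_..." flag
        "add_extra_token_to_sequence": not data_cfg.get("no_seqlen_plus_one_input_tokens", False),
    }


if __name__ == "__main__":
    # 10 remaining samples with a global batch size of 4 -> two None pads
    print(iter_padded_indices(consumed_samples=0, total_samples=10, global_batch_size=4))
    # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, None, None]

    print(mcore_dataset_kwargs({"validation_drop_last": False}))
    # {'drop_last_partial_validation_sequence': False, 'add_extra_token_to_sequence': True}
```

With these defaults, leaving `no_seqlen_plus_one_input_tokens` unset still yields `add_extra_token_to_sequence=True`, i.e. samples are drawn with sequence length + 1 tokens unless the option is explicitly enabled.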