hpcaitech · MaruyamaAya · May 25, 2022 · May 25, 2022 · May 25, 2022
diff --git a/README.md b/README.md
@@ -20,7 +20,7 @@ For single-device models, they require manual coding works to introduce tensor p
 
 At present, we pre-build distributed Bert and GPT models.  
 For GPT, it extends to at most 175B parameters, which is called [GPT3](https://arxiv.org/abs/2005.14165).  
-For Bert, Google reports a [super-large Bert with 481B parameters](https://mlcommons.org/en/training-normal-11/) in MLPerf-Training v1.1 open.
+For Bert, Google reports a [super-large Bert with 481B parameters](https://mlcommons.org/en/training-normal-11/) in MLPerf-Training v1.1 open, indicating that Bert can also be scaled to very large model sizes.
 
 ### Installation
 ``` bash

diff --git a/examples/gpt/gpt.py b/examples/gpt/gpt.py
@@ -317,11 +317,11 @@ def forward(self, hidden_states=None, input_ids=None, attention_mask=None, seq_l
             if seq_lens is not None:
                 hidden_states = ft_rebuild_padding(hidden_states, self.tmp_mask_offset[0:self.valid_word_num[0].item()], self.valid_word_num[0].item(), self.dim, batch_size, max_padding_size)
             hidden_states = self.head(self.norm(hidden_states))
-            # res = []
-            # for i in range(hidden_states.shape[0]):
-            #     res.append(self.select_top_k(i, hidden_states))
-            # hidden_states = torch.Tensor(res)
-            hidden_states = hidden_states[:, 0:1, 0:1].view(batch_size)
+            res = []
+            for i in range(hidden_states.shape[0]):
+                res.append(self.select_top_k(i, hidden_states))
+            hidden_states = torch.Tensor(res)
+            # hidden_states = hidden_states[:, 0:1, 0:1].view(batch_size)
         return hidden_states