diff --git a/README.md b/README.md
index f92d6c4..11b5b97 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ For single-device models, they require manual coding works to introduce tensor p
 
 At present, we pre-build distributed Bert and GPT models.
 For GPT, it extends to at most 175B parameters, which is called [GPT3](https://arxiv.org/abs/2005.14165).
-For Bert, Google reports a [super-large Bert with 481B parameters](https://mlcommons.org/en/training-normal-11/) in MLPerf-Training v1.1 open.
+For Bert, Google reports a [super-large Bert with 481B parameters](https://mlcommons.org/en/training-normal-11/) in MLPerf-Training v1.1 open, indicating that Bert can also extend to large-scale.
 
 ### Installation
 ``` bash
diff --git a/examples/gpt/gpt.py b/examples/gpt/gpt.py
index 98d657a..5b9b713 100644
--- a/examples/gpt/gpt.py
+++ b/examples/gpt/gpt.py
@@ -317,11 +317,11 @@ def forward(self, hidden_states=None, input_ids=None, attention_mask=None, seq_l
         if seq_lens is not None:
             hidden_states = ft_rebuild_padding(hidden_states, self.tmp_mask_offset[0:self.valid_word_num[0].item()], self.valid_word_num[0].item(), self.dim, batch_size, max_padding_size)
         hidden_states = self.head(self.norm(hidden_states))
-        # res = []
-        # for i in range(hidden_states.shape[0]):
-        #     res.append(self.select_top_k(i, hidden_states))
-        # hidden_states = torch.Tensor(res)
-        hidden_states = hidden_states[:, 0:1, 0:1].view(batch_size)
+        res = []
+        for i in range(hidden_states.shape[0]):
+            res.append(self.select_top_k(i, hidden_states))
+        hidden_states = torch.Tensor(res)
+        # hidden_states = hidden_states[:, 0:1, 0:1].view(batch_size)
         return hidden_states
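The gpt.py hunk above re-enables the per-sequence top-k selection loop and comments out the earlier single-element slice. The diff does not show how `select_top_k` is implemented, so the sketch below uses a hypothetical stand-in (the `select_top_k` body, `k`, and the random tie-break are assumptions) purely to illustrate the shape of the change: the new path produces one sampled token id per sequence, whereas the old path returned one raw logit element per sequence.

```python
import torch

def select_top_k(i, logits, k=1):
    # Hypothetical stand-in for the model's select_top_k: pick a token id
    # from the top-k logits at the last position of sequence i.
    topk = torch.topk(logits[i, -1, :], k=k)
    choice = torch.randint(0, k, (1,)).item()
    return topk.indices[choice].item()

batch_size, seq_len, vocab_size = 4, 16, 50257
# Stand-in for self.head(self.norm(hidden_states)) in the patched forward().
logits = torch.randn(batch_size, seq_len, vocab_size)

# New path enabled by the patch: one selected token id per sequence in the batch.
res = [select_top_k(i, logits) for i in range(logits.shape[0])]
next_tokens = torch.Tensor(res)  # shape: (batch_size,)

# Old path (now commented out): a single logit element per sequence,
# which discards the vocabulary dimension instead of selecting from it.
# next_tokens = logits[:, 0:1, 0:1].view(batch_size)
```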