Skip to content

Commit

Permalink
mcore distOpt restore fix
Browse files Browse the repository at this point in the history
Signed-off-by: Alexandros Koumparoulis <[email protected]>
  • Loading branch information
akoumpa committed Jun 25, 2024
1 parent 6ad3615 commit 55f7f35
Showing 1 changed file with 4 additions and 0 deletions.
4 changes: 4 additions & 0 deletions nemo/collections/nlp/parts/nlp_overrides.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,10 @@ def _check_param_groups_mismatch(self, checkpoint_path: Union[str, Path], sharde
bool: True if the number of param groups does not match
"""
common_state_dict = dist_checkpointing.load_common_state_dict(checkpoint_path)
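# @akoumparouli: if the checkpoint contains an mcore distributed optimizer, param_groups is stored under
# ['optimizer'].data['param_groups'] (with ['optimizer'] being a ShardedObject) rather than under
# ['optimizer_states'][0]['param_groups']; in that case the check below is skipped and no mismatch is reported.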
if common_state_dict.get('optimizer_states', [{}])[0].get('param_groups', None) is None:
return False
model_param_groups = self._get_param_group(common_state_dict)
checkpoint_param_groups = self._get_param_group(sharded_state_dict)
return len(model_param_groups) != len(checkpoint_param_groups)
Expand Down

0 comments on commit 55f7f35

Please sign in to comment.