Skip to content

Commit

Permalink
DS deepspeedai#4993 deepspeedai#662: autotune single node hostfile bugfix
Browse files Browse the repository at this point in the history
  • Loading branch information
John-Saxon committed Jan 24, 2024
1 parent 0dd0c61 commit 77bbc0e
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 0 deletions.
3 changes: 3 additions & 0 deletions deepspeed/autotuning/autotuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,6 +683,7 @@ def model_info_profile_run(self):
exp_config[DS_CONFIG] = ds_config
exp_config['num_gpus'] = self.exp_num_gpus
exp_config['num_nodes'] = self.exp_num_nodes
exp_config['hostfile'] = self.args.hostfile
exp_path = os.path.join(self.exps_dir, f'{exp_name}.json')

with open(exp_path, 'w', buffering=BUFSIZE) as fd:
Expand Down Expand Up @@ -761,6 +762,7 @@ def run_tuning_micro_batch_sizes(self, tuning_micro_batch_sizes, max_train_batch
exp_config[DS_CONFIG] = ds_config
exp_config['num_gpus'] = self.exp_num_gpus
exp_config['num_nodes'] = self.exp_num_nodes
exp_config['hostfile'] = self.args.hostfile
exp_path = os.path.join(self.exps_dir, f'{exp_name}.json')

with open(exp_path, 'w', buffering=BUFSIZE) as fd:
Expand Down Expand Up @@ -1055,6 +1057,7 @@ def run_ds_config(self, ds_config, exp_name):
exp_config[DS_CONFIG] = ds_config
exp_config['num_gpus'] = self.exp_num_gpus
exp_config['num_nodes'] = self.exp_num_nodes
exp_config['hostfile'] = self.args.hostfile
exp_path = os.path.join(self.exps_dir, f'{exp_name}.json')

logger.debug(f'run_ds_config exp_name = {exp_name}')
Expand Down
3 changes: 3 additions & 0 deletions deepspeed/autotuning/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,10 @@ def run_experiment(exp: dict, reservations, user_script, user_args):
include_str += f"{reservation.node.host}:{slots}@"
include_str = include_str[:-1]
master_port = exp["master_port"]
hostfile = exp["hostfile"]
exp["launcher_args"] = [
"--hostfile",
f"{hostfile}",
"--include",
f"{include_str}",
"--master_port",
Expand Down

0 comments on commit 77bbc0e

Please sign in to comment.