From 77bbc0ed05dbc46d61fe87e6e3386eb184e3baf6 Mon Sep 17 00:00:00 2001 From: John-Saxon Date: Tue, 23 Jan 2024 19:13:04 +0800 Subject: [PATCH] DS #4993 #662 : autotune single node hostfile bugfix --- deepspeed/autotuning/autotuner.py | 3 +++ deepspeed/autotuning/scheduler.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/deepspeed/autotuning/autotuner.py b/deepspeed/autotuning/autotuner.py index c77415beb358..dfd195bc37eb 100755 --- a/deepspeed/autotuning/autotuner.py +++ b/deepspeed/autotuning/autotuner.py @@ -683,6 +683,7 @@ def model_info_profile_run(self): exp_config[DS_CONFIG] = ds_config exp_config['num_gpus'] = self.exp_num_gpus exp_config['num_nodes'] = self.exp_num_nodes + exp_config['hostfile'] = self.args.hostfile exp_path = os.path.join(self.exps_dir, f'{exp_name}.json') with open(exp_path, 'w', buffering=BUFSIZE) as fd: @@ -761,6 +762,7 @@ def run_tuning_micro_batch_sizes(self, tuning_micro_batch_sizes, max_train_batch exp_config[DS_CONFIG] = ds_config exp_config['num_gpus'] = self.exp_num_gpus exp_config['num_nodes'] = self.exp_num_nodes + exp_config['hostfile'] = self.args.hostfile exp_path = os.path.join(self.exps_dir, f'{exp_name}.json') with open(exp_path, 'w', buffering=BUFSIZE) as fd: @@ -1055,6 +1057,7 @@ def run_ds_config(self, ds_config, exp_name): exp_config[DS_CONFIG] = ds_config exp_config['num_gpus'] = self.exp_num_gpus exp_config['num_nodes'] = self.exp_num_nodes + exp_config['hostfile'] = self.args.hostfile exp_path = os.path.join(self.exps_dir, f'{exp_name}.json') logger.debug(f'run_ds_config exp_name = {exp_name}') diff --git a/deepspeed/autotuning/scheduler.py b/deepspeed/autotuning/scheduler.py index 40978aa00ab9..7d2a1c081da9 100755 --- a/deepspeed/autotuning/scheduler.py +++ b/deepspeed/autotuning/scheduler.py @@ -316,7 +316,10 @@ def run_experiment(exp: dict, reservations, user_script, user_args): include_str += f"{reservation.node.host}:{slots}@" include_str = include_str[:-1] master_port = exp["master_port"] + hostfile = exp["hostfile"] exp["launcher_args"] = [ + "--hostfile", + f"{hostfile}", "--include", f"{include_str}", "--master_port",