Skip to content

Commit

Permalink
[cambricon] Fix MLU PID retrieval issue in nnodes FlagScale training
Browse files Browse the repository at this point in the history
  • Loading branch information
cifar10 committed Sep 9, 2024
1 parent 3363e47 commit 3ca556e
Showing 1 changed file with 18 additions and 0 deletions.
18 changes: 18 additions & 0 deletions training/utils/start_task_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,18 @@ def get_extern_module_dir(task_args):
return extern_module_dir


def get_mlu_pid():
    '''Return the PID of the first process whose `ps aux` line mentions
    'MLU_VISIBLE_DEVICES', or None if no such process is found.

    The PID is returned as a string, exactly as printed in the second
    column of the `ps aux` output. Lines that contain 'grep' are ignored
    so that a grep searching for the marker does not match itself.

    None is also returned when `ps` cannot be launched at all, so callers
    can treat "no MLU process" and "cannot inspect processes" the same way.
    '''
    import subprocess

    try:
        # run() waits for `ps` to exit and closes the pipe, so no child
        # process or file descriptor is left behind (a bare Popen that is
        # returned from mid-iteration leaks both).
        completed = subprocess.run(
            ['ps', 'aux'],
            stdout=subprocess.PIPE,
            text=True,
            # Command lines of unrelated processes may hold bytes that are
            # invalid in the locale encoding; do not fail on them.
            errors='replace',
            check=False,
        )
    except OSError:
        # `ps` is missing or not executable in this environment.
        return None

    for line in completed.stdout.splitlines():
        if 'MLU_VISIBLE_DEVICES' not in line or 'grep' in line:
            continue
        fields = line.split()
        # Guard against a malformed line that has no PID column.
        if len(fields) > 1:
            return fields[1]
    return None


def write_pid_file(pid_file_path, pid_file):
'''Write pid file for watching the process later.
In each round of case, we will write the current pid in the same path.
Expand All @@ -62,6 +74,12 @@ def write_pid_file(pid_file_path, pid_file):
file_d.write("%s\n" % os.getpid())
file_d.close()

mlu_pid = get_mlu_pid()
if mlu_pid:
file_d = open(pid_file_path, "w")
file_d.write("%s\n" % mlu_pid)
file_d.close()


def init_flagperf_logger(logger, task_args):
'''Init the logger according to task_args, and return the log dir.'''
Expand Down

0 comments on commit 3ca556e

Please sign in to comment.