Skip to content

Commit

Permalink
orte: use a unique top_session_dir directory when possible.
Browse files Browse the repository at this point in the history
Currently, top_session_dir is based on the hostname, uid, HNP pid and job family.
There is a risk that a top_session_dir already exists and contains some old data when
a job starts, leading to undefined behavior.

If the app is started via mpirun, use a unique top_session_dir created with
mkdtemp("$TMP/ompi.<hostname>.<uid>/XXXXXX"), which is passed to the forked MPI tasks
via the OPAL_MCA_PREFIX"orte_top_session_dir" environment variable.

If the app is direct launched, the current behavior is unchanged.
Direct launch behavior will be enhanced once PMIx is able to pass
a per-node directory (PMIX_NSDIR ?) to a direct-launched task.
  • Loading branch information
ggouaillardet committed Sep 14, 2017
1 parent 8d336dd commit 13932a3
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 2 deletions.
5 changes: 5 additions & 0 deletions orte/orted/orted_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,11 @@ int orte_daemon(int argc, char *argv[])
opal_argv_append_nosize(&singenv, env_str);
free(env_str);

/* append the top session dir to the envars needed by the singleton */
asprintf(&env_str, OPAL_MCA_PREFIX"orte_top_session_dir=%s", orte_process_info.top_session_dir);
opal_argv_append_nosize(&singenv, env_str);
free(env_str);

nptr = opal_argv_join(singenv, '*');
opal_argv_free(singenv);

Expand Down
33 changes: 31 additions & 2 deletions orte/util/session_dir.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
Expand Down Expand Up @@ -134,13 +134,24 @@ static int _setup_tmpdir_base(void)
int orte_setup_top_session_dir(void)
{
int rc = ORTE_SUCCESS;
char *env;
/* get the effective uid */
uid_t uid = geteuid();


/* construct the top_session_dir if we need */
if (NULL == orte_process_info.top_session_dir) {
env = getenv(OPAL_MCA_PREFIX"orte_top_session_dir");
if (NULL != env) {
orte_process_info.tmpdir_base = strdup(env);
orte_process_info.top_session_dir = strdup(env);
return ORTE_SUCCESS;
}

assert(!ORTE_PROC_IS_APP || (NULL == getenv(OPAL_MCA_PREFIX"orte_launch")));

if (ORTE_SUCCESS != (rc = _setup_tmpdir_base())) {
return rc;
goto exit;
}
if( NULL == orte_process_info.nodename ||
NULL == orte_process_info.tmpdir_base ){
Expand All @@ -156,6 +167,24 @@ int orte_setup_top_session_dir(void)
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto exit;
}
if (!ORTE_PROC_IS_APP) {
char *dir = orte_process_info.top_session_dir;
if (ORTE_SUCCESS != (rc = orte_create_dir(dir))) {
goto exit;
}
if (0 > asprintf(&orte_process_info.top_session_dir, "%s/XXXXXX", dir)) {
free(dir);
orte_process_info.top_session_dir = NULL;
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto exit;
}
free(dir);

if (NULL == mkdtemp(orte_process_info.top_session_dir)) {
rc = ORTE_ERROR;
goto exit;
}
}
}
exit:
if( ORTE_SUCCESS != rc ){
Expand Down

0 comments on commit 13932a3

Please sign in to comment.