Skip to content

Commit

Permalink
Merge pull request open-mpi#7 from lrrajesh/master
Browse files Browse the repository at this point in the history
klocwork, help file changes and segfault issue resolved in job launch code.
  • Loading branch information
rhc54 committed Jun 26, 2014
2 parents 743efce + 4b67433 commit aace861
Show file tree
Hide file tree
Showing 9 changed files with 100 additions and 34 deletions.
8 changes: 5 additions & 3 deletions opal/class/opal_object.h
Original file line number Diff line number Diff line change
Expand Up @@ -247,9 +247,11 @@ static inline opal_object_t *opal_obj_new(opal_class_t * cls);
/**
 * Create an object of the given class and record where it was created.
 *
 * Calls opal_obj_new() and, when that call yields a non-NULL object, stamps
 * the object with OPAL_OBJ_MAGIC_ID plus the caller-supplied file name and
 * line number.
 *
 * @param type  class descriptor, passed straight through to opal_obj_new()
 * @param file  file name stored in cls_init_file_name (the pointer is stored,
 *              the string is not copied)
 * @param line  line number stored in cls_init_lineno
 *
 * @return the object returned by opal_obj_new(), which may be NULL
 */
static inline opal_object_t *opal_obj_new_debug(opal_class_t* type, const char* file, int line)
{
    opal_object_t* object = opal_obj_new(type);

    /* Guard against a NULL result from opal_obj_new(): the debug fields
     * must only be written when there is an object to write to. */
    if (NULL != object) {
        object->obj_magic_id = OPAL_OBJ_MAGIC_ID;
        object->cls_init_file_name = file;
        object->cls_init_lineno = line;
    }
    return object;
}
#define OBJ_NEW(type) \
Expand Down
19 changes: 12 additions & 7 deletions orcm/mca/sst/orcmsd/sst_orcmsd.c
Original file line number Diff line number Diff line change
Expand Up @@ -146,16 +146,21 @@ static int orcmsd_init(void)
if (ORTE_PROC_IS_HNP) {
ORTE_PROC_MY_NAME->vpid = 0;
} else {
if(NULL == mca_sst_orcmsd_component.base_vpid) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
if (NULL == mca_sst_orcmsd_component.base_vpid) {
ret = ORTE_ERR_NOT_FOUND;
error = "requires a vpid";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_vpid(&ORTE_PROC_MY_NAME->vpid, mca_sst_orcmsd_component.base_vpid))) {
ORTE_ERROR_LOG(ret);
error = "convert_string_to_vpid";
goto error;
}
}
} else {
ret = ORTE_ERR_NOT_FOUND;
error = "requires a jobid";
goto error;
}

/* datastore - ensure we don't pickup the pmi component, but
Expand Down Expand Up @@ -548,8 +553,8 @@ static int orcmsd_init(void)

error:
if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
orte_show_help("help-orte-runtime.txt",
"orte_init:startup:internal-failure",
orte_show_help("help-orcmsd.txt",
"orcmsd_init:startup:internal-failure",
true, error, ORTE_ERROR_NAME(ret), ret);
}
return ret;
Expand Down Expand Up @@ -678,8 +683,8 @@ static int orcmsd_setup_node_pool(void)
return ORTE_SUCCESS;

error:
orte_show_help("help-orte-runtime.txt",
"orte_init:startup:internal-failure",
orte_show_help("help-orcmsd.txt",
"orcmsd_init:startup:internal-failure",
true, error, ORTE_ERROR_NAME(ret), ret);

return ORTE_ERR_SILENT;
Expand Down
16 changes: 7 additions & 9 deletions orcm/mca/sst/orcmsd/sst_orcmsd_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,6 @@
#include "orcm/mca/sst/sst.h"
#include "orcm/mca/sst/orcmsd/sst_orcmsd.h"

char *orcm_node_regex;
char *orcm_base_jobid;
char *orcm_base_vpid;

static int component_register(void);
static int component_open(void);
static int component_close(void);
Expand Down Expand Up @@ -51,26 +47,28 @@ orcm_sst_orcmsd_component_t mca_sst_orcmsd_component = {

static int component_register(void)
{
int var_id;
mca_base_component_t *component = & mca_sst_orcmsd_component.super.base_version;
mca_sst_orcmsd_component.node_regex = NULL;
(void) mca_base_var_register ("orcm", "orcm", NULL, "node_regex",
var_id = mca_base_component_var_register (component, "node_regex",
"node_regex of allocated nodes.",
MCA_BASE_VAR_TYPE_STRING, NULL, 0,
MCA_BASE_VAR_FLAG_INTERNAL,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_CONSTANT,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_sst_orcmsd_component.node_regex);

mca_sst_orcmsd_component.base_jobid = NULL;
(void) mca_base_var_register ("orcm", "orcm", "base", "jobid",
var_id = mca_base_component_var_register (component, "jobid",
"orcmsd base jobid.",
MCA_BASE_VAR_TYPE_STRING, NULL, 0,
MCA_BASE_VAR_FLAG_INTERNAL,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_CONSTANT,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_sst_orcmsd_component.base_jobid);

mca_sst_orcmsd_component.base_vpid = NULL;
(void) mca_base_var_register ("orcm", "orcm", "base", "vpid",
var_id = mca_base_component_var_register (component, "vpid",
"orcmsd base vpid",
MCA_BASE_VAR_TYPE_STRING, NULL, 0,
MCA_BASE_VAR_FLAG_INTERNAL,
Expand Down
13 changes: 7 additions & 6 deletions orcm/tools/orcmd/orcmd.c
Original file line number Diff line number Diff line change
Expand Up @@ -403,10 +403,10 @@ slm_fork_hnp_procs(orte_jobid_t jobid, uid_t uid, gid_t gid,
return ORTE_ERR_SYS_LIMITS_PIPES;
}

/* find the orted binary using the install_dirs support - this also
/* find the orcmsd binary using the install_dirs support - this also
* checks to ensure that we can see this executable and it *is* executable by us
*/
cmd = opal_path_access("orted", opal_install_dirs.bindir, X_OK);
cmd = opal_path_access("orcmsd", opal_install_dirs.bindir, X_OK);
if (NULL == cmd) {

/* guess we couldn't do it - best to abort */
Expand All @@ -419,7 +419,7 @@ slm_fork_hnp_procs(orte_jobid_t jobid, uid_t uid, gid_t gid,

/* pass it a jobid to match my job family */
opal_argv_append(&argc, &argv, "-mca");
opal_argv_append(&argc, &argv, "orcm_base_jobid");
opal_argv_append(&argc, &argv, "sst_orcmsd_jobid");
if (ORTE_SUCCESS !=
(rc = orte_util_convert_jobid_to_string(&param, jobid))) {
ORTE_ERROR_LOG(rc);
Expand All @@ -431,7 +431,7 @@ slm_fork_hnp_procs(orte_jobid_t jobid, uid_t uid, gid_t gid,
if( hnp ) {
/* setup to pass the vpid */
opal_argv_append(&argc, &argv, "-mca");
opal_argv_append(&argc, &argv, "orcm_base_vpid");
opal_argv_append(&argc, &argv, "sst_orcmsd_vpid");
opal_argv_append(&argc, &argv, "0");

/* tell the daemon it is to be the HNP */
Expand Down Expand Up @@ -463,7 +463,7 @@ slm_fork_hnp_procs(orte_jobid_t jobid, uid_t uid, gid_t gid,
} else {
/* setup to pass the vpid */
opal_argv_append(&argc, &argv, "-mca");
opal_argv_append(&argc, &argv, "orcm_base_vpid");
opal_argv_append(&argc, &argv, "sst_orcmsd_vpid");
opal_argv_append(&argc, &argv, "1");
/* pass the uri of the hnp */
asprintf(&param, "\"%s\"", hnp_uri);
Expand All @@ -479,7 +479,8 @@ slm_fork_hnp_procs(orte_jobid_t jobid, uid_t uid, gid_t gid,

/* if we have static ports, pass the node list */
if (NULL != nodes) {
opal_argv_append(&argc, &argv, "--nodes");
opal_argv_append(&argc, &argv, "-mca");
opal_argv_append(&argc, &argv, "sst_orcmsd_node_regex");
opal_argv_append(&argc, &argv, nodes);
}

Expand Down
2 changes: 2 additions & 0 deletions orcm/tools/orcmsd/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
# $HEADER$
#

dist_orcmdata_DATA = help-orcmsd.txt

include $(top_srcdir)/Makefile.ompi-rules

man_pages = orcmsd.1
Expand Down
46 changes: 45 additions & 1 deletion orcm/tools/orcmsd/help-orcmsd.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,49 @@ a core that does not exist on this node:
node: %s
cores: %s

The MCA param directing this behavior is orte_daemon_cores.
The MCA param directing this behavior is orcmsd_daemon_cores.
Please correct the request and try again.
#
#
[orcmsd_init:startup:internal-failure]
It looks like orcmsd_init failed for some reason; your parallel process is
likely to abort. There are many reasons that a parallel process can
fail during orcmsd_init, some of which are due to configuration or
environment problems. This failure appears to be an internal failure;
here's some additional information:

%s failed
--> Returned value %s (%d) instead of ORTE_SUCCESS
#
#
[orcmsd:session:dir:prohibited]
The specified location for the temporary directories required by orcmsd
is on the list of prohibited locations:

Location given: %s
Prohibited locations: %s

If you believe this is in error, please contact your system administrator
to have the list of prohibited locations changed. Otherwise, please identify
a different location to be used (use -h to see the cmd line option), or
simply let the system pick a default location.
#
[orcmsd:session:dir:nopwname]
orcmsd was unable to obtain the username in order to create a path
for its required temporary directories. This type of error is usually
caused by a transient failure of network-based authentication services
(e.g., LDAP or NIS failure due to network congestion), but can also be
an indication of system misconfiguration.

Please consult your system administrator about these issues and try
again.
#
[orcmsd_nidmap:too_many_nodes]
An error occurred while trying to pack the information about the job. More nodes
were found than the %d expected. Please check your configuration files, such
as the mapping.
#
[orcmsd_init:startup:num_daemons]
orcmsd was unable to determine the number of nodes in your allocation. We
are therefore assuming a very large number to ensure you receive proper error
messages.
2 changes: 1 addition & 1 deletion orcm/tools/orcmsd/orcmsd.1in
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
.\" Man page contributed by Dirk Eddelbuettel <[email protected]>
.\" and released under the BSD license.
.\" Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
.TH ORTED 1 "#OPAL_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#"
.TH ORCMSD 1 "#OPAL_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#"
.SH NAME
orcmsd - Start an Open RTE User-Level Daemon
.SH SYNOPSIS
Expand Down
14 changes: 11 additions & 3 deletions orcm/tools/orcmsd/orcmsd.c
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,15 @@ opal_cmd_line_init_t orcmsd_cmd_line_opts[] = {
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Redirect output from application processes into filename.rank" },

{ "orcm_node_regex", '\0', "nodes", "nodes", 1,
{ "sst_orcmsd_node_regex", '\0', "node_regex", "node_regex", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Regular expression defining nodes in system" },

{ "sst_orcmsd_jobid", '\0', "jobid", "jobid", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Regular expression defining nodes in system" },

{ "sst_orcmsd_vpid", '\0', "vpid", "vpid", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Regular expression defining nodes in system" },

Expand Down Expand Up @@ -301,7 +309,7 @@ int main(int argc, char *argv[])
if (orcmsd_globals.help) {
char *args = NULL;
args = opal_cmd_line_get_usage_msg(cmd_line);
orte_show_help("help-orted.txt", "orted:usage", false,
orte_show_help("help-orcmsd.txt", "orted:usage", false,
argv[0], args);
free(args);
return 1;
Expand Down Expand Up @@ -368,7 +376,7 @@ int main(int argc, char *argv[])
*/
orte_show_help_finalize();
/* the message will now come out locally */
orte_show_help("help-orted.txt", "orted:cannot-bind",
orte_show_help("help-orcmsd.txt", "orted:cannot-bind",
true, orte_process_info.nodename,
orte_daemon_cores);
ret = ORTE_ERR_NOT_SUPPORTED;
Expand Down
14 changes: 10 additions & 4 deletions orcm/tools/orun/orun.c
Original file line number Diff line number Diff line change
Expand Up @@ -734,6 +734,7 @@ int orun(int argc, char *argv[])
if (0 == param_len) {
orte_show_help("help-orun.txt", "orun:empty-prefix",
true, orte_basename, orte_basename);
free(param);
return ORTE_ERR_FATAL;
}
}
Expand Down Expand Up @@ -1468,9 +1469,11 @@ static int create_app(int argc, char* argv[],
*/
param = strdup(environ[i]);
value = strchr(param, '=');
*value = '\0';
value++;
opal_setenv(param, value, false, &app->env);
if (value != NULL) {
*value = '\0';
value++;
opal_setenv(param, value, false, &app->env);
}
free(param);
}
}
Expand Down Expand Up @@ -1537,7 +1540,7 @@ static int create_app(int argc, char* argv[],
free(value2);
}
} else {
opal_output(0, "Warning: could not find environment variable \"%s\"\n", param);
opal_output(0, "Warning: could not find environment variable \"%s\"\n", vars[i]);
}
}
}
Expand Down Expand Up @@ -1644,6 +1647,7 @@ static int create_app(int argc, char* argv[],
if (0 == param_len) {
orte_show_help("help-orun.txt", "orun:empty-prefix",
true, orte_basename, orte_basename);
free(param);
return ORTE_ERR_FATAL;
}
}
Expand Down Expand Up @@ -1992,6 +1996,7 @@ static int parse_appfile(orte_job_t *jdata, char *filename, char ***env)
if (NULL != *env) {
tmp_env = opal_argv_copy(*env);
if (NULL == tmp_env) {
fclose(fp);
return ORTE_ERR_OUT_OF_RESOURCE;
}
} else {
Expand All @@ -2002,6 +2007,7 @@ static int parse_appfile(orte_job_t *jdata, char *filename, char ***env)
if (ORTE_SUCCESS != rc) {
/* Assume that the error message has already been
printed; no need to cleanup -- we can just exit */
fclose(fp);
exit(1);
}
if (NULL != tmp_env) {
Expand Down

0 comments on commit aace861

Please sign in to comment.