Skip to content

Fix debugger operations and show_help aggregation #1480

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 19, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ompi/mca/rte/orte/rte_orte.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ OMPI_DECLSPEC void __opal_attribute_noreturn__
#define OMPI_ERROR_LOG ORTE_ERROR_LOG

/* Init and finalize objects and operations */
OMPI_DECLSPEC int ompi_rte_init(int *pargc, char ***pargv);
#define ompi_rte_init(a, b) orte_init(a, b, ORTE_PROC_MPI)
#define ompi_rte_finalize() orte_finalize()
OMPI_DECLSPEC void ompi_rte_wait_for_debugger(void);

Expand Down
101 changes: 21 additions & 80 deletions ompi/mca/rte/orte/rte_orte_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,79 +52,6 @@

extern ompi_rte_orte_component_t mca_rte_orte_component;

/*
 * Bookkeeping for an asynchronous errhandler registration: the
 * requester sets "active" before issuing the request and the
 * registration callback fills in the result and clears it.
 */
typedef struct {
    volatile bool active;   /* true while the registration is outstanding */
    int status;             /* status passed to the registration callback */
    int errhandler;         /* errhandler value passed to the callback */
} errhandler_t;

/*
 * Registration-complete callback passed to opal_pmix.register_errhandler.
 *
 * status    - result of the registration request
 * errhndler - errhandler value reported for the registration
 * cbdata    - the errhandler_t tracker supplied by the requester
 *
 * The results are stored first and the "active" flag is cleared last,
 * since the requester waits on that flag before reading the results.
 */
static void register_cbfunc(int status, int errhndler, void *cbdata)
{
    errhandler_t *tracker = cbdata;

    tracker->status = status;
    tracker->errhandler = errhndler;
    /* must remain the final store - it is what the waiter is watching */
    tracker->active = false;
}

/* Starts out true; notify_cbfunc sets it to false when a notification
 * is delivered. */
static volatile bool wait_for_release = true;
/* Errhandler value recorded by ompi_rte_init after a successful
 * registration; stays -1 until then. */
static int errhandler = -1;

/*
 * Notification callback registered with opal_pmix.register_errhandler.
 *
 * The status, procs and info arguments are not examined. If a release
 * callback was supplied it is invoked with cbdata first, and then the
 * file-scope wait_for_release flag is cleared.
 */
static void notify_cbfunc(int status,
                          opal_list_t *procs,
                          opal_list_t *info,
                          opal_pmix_release_cbfunc_t cbfunc,
                          void *cbdata)
{
    (void)status;
    (void)procs;
    (void)info;

    /* hand control back to the notifier before flipping our flag */
    if (cbfunc != NULL) {
        cbfunc(cbdata);
    }

    wait_for_release = false;
}


/*
 * Initialize the ORTE runtime for an MPI process and, unless running in
 * standalone operation, register an errhandler for
 * OPAL_ERR_DEBUGGER_RELEASE notifications.
 *
 * pargc, pargv - forwarded unchanged to orte_init
 *
 * Returns the orte_init error code if initialization fails, OMPI_ERROR
 * if the errhandler registration reports a status other than
 * OPAL_SUCCESS, and OMPI_SUCCESS otherwise.
 */
int ompi_rte_init(int *pargc, char ***pargv)
{
    int ret;
    opal_list_t directives;
    opal_value_t release_kv;
    errhandler_t reg;

    ret = orte_init(pargc, pargv, ORTE_PROC_MPI);
    if (ORTE_SUCCESS != ret) {
        return ret;
    }

    /* nothing to register when running standalone */
    if (orte_standalone_operation) {
        return OMPI_SUCCESS;
    }

    /* tracker that register_cbfunc fills in when registration completes */
    reg.active = true;
    reg.status = ORTE_ERROR;
    reg.errhandler = -1;

    /* register to receive any debugger release */
    OBJ_CONSTRUCT(&directives, opal_list_t);
    OBJ_CONSTRUCT(&release_kv, opal_value_t);
    release_kv.key = strdup(OPAL_PMIX_ERROR_NAME);
    release_kv.type = OPAL_INT;
    release_kv.data.integer = OPAL_ERR_DEBUGGER_RELEASE;
    opal_list_append(&directives, &release_kv.super);

    opal_pmix.register_errhandler(&directives, notify_cbfunc, register_cbfunc, &reg);

    /* let the MPI progress engine run while we wait for
     * registration to complete */
    OMPI_WAIT_FOR_COMPLETION(reg.active);

    /* unlink the automatic-storage item before tearing down the list
     * (presumably so the list destructor never touches it) */
    opal_list_remove_first(&directives);
    OBJ_DESTRUCT(&release_kv);
    OBJ_DESTRUCT(&directives);

    if (OPAL_SUCCESS != reg.status) {
        /* registration failed - we cannot continue */
        ORTE_ERROR_LOG(reg.status);
        return OMPI_ERROR;
    }

    /* remember the errhandler value for later use */
    errhandler = reg.errhandler;
    return OMPI_SUCCESS;
}

void ompi_rte_abort(int error_code, char *fmt, ...)
{
va_list arglist;
Expand Down Expand Up @@ -173,10 +100,10 @@ void ompi_rte_abort(int error_code, char *fmt, ...)
* attaching debuggers -- see big comment in
* orte/tools/orterun/debuggers.c explaining the two scenarios.
*/

void ompi_rte_wait_for_debugger(void)
{
int debugger;
orte_rml_recv_cb_t xfer;

/* See lengthy comment in orte/tools/orterun/debuggers.c about
orte_in_parallel_debugger */
Expand All @@ -186,16 +113,16 @@ void ompi_rte_wait_for_debugger(void)
debugger = 1;
}

if (!debugger) {
if (!debugger && NULL == getenv("ORTE_TEST_DEBUGGER_ATTACH")) {
/* if not, just return */
return;
}

/* if we are being debugged, then we need to find
* the correct plug-ins
*/
ompi_debugger_setup_dlls();

/* wait for the debugger to attach */
if (orte_standalone_operation) {
/* spin until debugger attaches and releases us */
while (MPIR_debug_gate == 0) {
Expand All @@ -206,9 +133,23 @@ void ompi_rte_wait_for_debugger(void)
#endif
}
} else {
/* now wait for the notification to occur */
OMPI_WAIT_FOR_COMPLETION(wait_for_release);
/* deregister the errhandler */
opal_pmix.deregister_errhandler(errhandler, NULL, NULL);
/* only the rank=0 proc waits for either a message from the
* HNP or for the debugger to attach - everyone else will just
* spin in the grpcomm barrier in ompi_mpi_init until rank=0
* joins them.
*/
if (0 != ORTE_PROC_MY_NAME->vpid) {
return;
}

/* VPID 0 waits for a message from the HNP */
OBJ_CONSTRUCT(&xfer, orte_rml_recv_cb_t);
xfer.active = true;
orte_rml.recv_buffer_nb(OMPI_NAME_WILDCARD,
ORTE_RML_TAG_DEBUGGER_RELEASE,
ORTE_RML_NON_PERSISTENT,
orte_rml_recv_callback, &xfer);
/* let the MPI progress engine run while we wait */
OMPI_WAIT_FOR_COMPLETION(xfer.active);
}
}
87 changes: 84 additions & 3 deletions orte/mca/ess/base/ess_base_std_app.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,14 @@
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"

#include "orte/mca/rml/base/base.h"
#include "orte/mca/routed/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/dfs/base/base.h"
#include "orte/mca/grpcomm/base/base.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/qos/base/base.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/filem/base/base.h"
#include "orte/mca/errmgr/base/base.h"
Expand Down Expand Up @@ -169,14 +174,84 @@ int orte_ess_base_app_setup(bool db_restrict_local)
}
OBJ_DESTRUCT(&kv);
}

/* Setup the communication infrastructure */
/*
* OOB Layer
*/
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_oob_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_oob_base_select";
goto error;
}
/* Runtime Messaging Layer */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_rml_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_rml_base_select";
goto error;
}
/* Messaging QoS Layer */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_qos_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_qos_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_qos_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_qos_base_select";
goto error;
}
/* setup the errmgr */
if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr_base_select";
goto error;
}

/* Routed system */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_routed_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_routed_base_select";
goto error;
}
/*
* Group communications
*/
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_grpcomm_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_grpcomm_base_select";
goto error;
}
/* enable communication via the rml */
if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
ORTE_ERROR_LOG(ret);
error = "orte_rml.enable_comm";
goto error;
}
/* setup the routed info */
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
ORTE_ERROR_LOG(ret);
error = "orte_routed.init_routes";
goto error;
}
#if OPAL_ENABLE_FT_CR == 1
/*
* Setup the SnapC
Expand Down Expand Up @@ -247,7 +322,13 @@ int orte_ess_base_app_finalize(void)
(void) mca_base_framework_close(&orte_filem_base_framework);
(void) mca_base_framework_close(&orte_errmgr_base_framework);

/* now can close the rml and its friendly group comm */
(void) mca_base_framework_close(&orte_grpcomm_base_framework);
(void) mca_base_framework_close(&orte_dfs_base_framework);
(void) mca_base_framework_close(&orte_routed_base_framework);

(void) mca_base_framework_close(&orte_rml_base_framework);
(void) mca_base_framework_close(&orte_oob_base_framework);
(void) mca_base_framework_close(&orte_state_base_framework);

orte_session_dir_finalize(ORTE_PROC_MY_NAME);
Expand Down Expand Up @@ -296,7 +377,7 @@ void orte_ess_base_app_abort(int status, bool report)
* the message if routing is enabled as this indicates we
* have someone to send to
*/
if (report && orte_create_session_dirs) {
if (report && orte_routing_is_enabled && orte_create_session_dirs) {
myfile = opal_os_path(false, orte_process_info.proc_session_dir, "aborted", NULL);
fd = open(myfile, O_CREAT, S_IRUSR);
close(fd);
Expand Down
13 changes: 13 additions & 0 deletions orte/mca/ess/pmi/ess_pmi_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@
#include "opal/mca/pmix/base/base.h"

#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/rml/rml.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
Expand Down Expand Up @@ -85,6 +87,7 @@ static int rte_init(void)
char *envar, *ev1, *ev2;
uint64_t unique_key[2];
char *string_key;
char *rmluri;
opal_value_t *kv;
char *val;
int u32, *u32ptr;
Expand Down Expand Up @@ -379,6 +382,16 @@ static int rte_init(void)

/*** PUSH DATA FOR OTHERS TO FIND ***/

/* push our RML URI in case others need to talk directly to us */
rmluri = orte_rml.get_contact_info();
/* push it out for others to use */
OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_PROC_URI, rmluri, OPAL_STRING);
if (ORTE_SUCCESS != ret) {
error = "pmix put uri";
goto error;
}
free(rmluri);

/* push our hostname so others can find us, if they need to */
OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME, orte_process_info.nodename, OPAL_STRING);
if (ORTE_SUCCESS != ret) {
Expand Down
56 changes: 56 additions & 0 deletions orte/mca/oob/usock/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2012-2013 Los Alamos National Security, LLC.
# All rights reserved
# Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# Headers and C sources that make up the usock OOB component; the same
# list feeds both the DSO and the static library targets below.
sources = \
oob_usock_component.h \
oob_usock.h \
oob_usock_component.c \
oob_usock_connection.h \
oob_usock_sendrecv.h \
oob_usock_hdr.h \
oob_usock_peer.h \
oob_usock_ping.h \
oob_usock.c \
oob_usock_connection.c \
oob_usock_sendrecv.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

# Exactly one of the two variables is non-empty, selecting which
# library target is actually built.
if MCA_BUILD_orte_oob_usock_DSO
component_noinst =
component_install = mca_oob_usock.la
else
component_noinst = libmca_oob_usock.la
component_install =
endif

# DSO case: the component is installed into $(ortelibdir).
mcacomponentdir = $(ortelibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_oob_usock_la_SOURCES = $(sources)
mca_oob_usock_la_LDFLAGS = -module -avoid-version

# Static case: the library is listed under noinst_LTLIBRARIES, so it is
# built but not installed.
noinst_LTLIBRARIES = $(component_noinst)
libmca_oob_usock_la_SOURCES = $(sources)
libmca_oob_usock_la_LDFLAGS = -module -avoid-version

Loading