Skip to content

Commit 8ffde8d

Browse files
author
rhc54
committed
Merge pull request #1431 from rhc54/topic/orted
Do not push child processes into separate process groups so that any …
2 parents 36a6a3b + d72c1c7 commit 8ffde8d

File tree

8 files changed

+15
-75
lines changed

8 files changed

+15
-75
lines changed

orte/mca/ess/hnp/ess_hnp_module.c

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -786,10 +786,8 @@ static int rte_finalize(void)
786786
/** Remove the USR signal handlers */
787787
opal_event_signal_del(&sigusr1_handler);
788788
opal_event_signal_del(&sigusr2_handler);
789-
if (orte_forward_job_control) {
790-
opal_event_signal_del(&sigtstp_handler);
791-
opal_event_signal_del(&sigcont_handler);
792-
}
789+
opal_event_signal_del(&sigtstp_handler);
790+
opal_event_signal_del(&sigcont_handler);
793791
signals_set = false;
794792
}
795793

orte/mca/odls/alps/odls_alps_module.c

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -416,13 +416,6 @@ static int do_child(orte_app_context_t* context,
416416
sigset_t sigs;
417417
char *param, *msg;
418418

419-
if (orte_forward_job_control) {
420-
/* Set a new process group for this child, so that a
421-
SIGSTOP can be sent to it without being sent to the
422-
orted. */
423-
setpgid(0, 0);
424-
}
425-
426419
/* Setup the pipe to be close-on-exec */
427420
opal_fd_set_cloexec(write_fd);
428421

@@ -798,11 +791,6 @@ static int send_signal(pid_t pid, int signal)
798791
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
799792
signal, (long)pid));
800793

801-
if (orte_forward_job_control) {
802-
/* Send the signal to the process group rather than the
803-
process. The child is the leader of its process group. */
804-
pid = -pid;
805-
}
806794
if (kill(pid, signal) != 0) {
807795
switch(errno) {
808796
case EINVAL:

orte/mca/odls/default/odls_default_module.c

Lines changed: 11 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -193,18 +193,18 @@ static bool odls_default_child_died(orte_proc_t *child)
193193
* that occasionally causes us to incorrectly report a proc
194194
* as refusing to die. Unfortunately, errno may not be reset
195195
* by waitpid in this case, so we cannot check it.
196-
*
197-
* (note the previous fix to this, to return 'process dead'
198-
* here, fixes the race condition at the cost of reporting
199-
* all live processes have immediately died! Better to
200-
* occasionally report a dead process as still living -
201-
* which will occasionally trip the timeout for cases that
202-
* are right on the edge.)
196+
*
197+
* (note the previous fix to this, to return 'process dead'
198+
* here, fixes the race condition at the cost of reporting
199+
* all live processes have immediately died! Better to
200+
* occasionally report a dead process as still living -
201+
* which will occasionally trip the timeout for cases that
202+
* are right on the edge.)
203203
*/
204204
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
205205
"%s odls:default:WAITPID INDICATES PID %d MAY HAVE ALREADY EXITED",
206206
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
207-
/* Do nothing, process still alive */
207+
/* Do nothing, process still alive */
208208
} else if (-1 == ret && ECHILD == errno) {
209209
/* The pid no longer exists, so we'll call this "good
210210
enough for government work" */
@@ -228,23 +228,10 @@ static bool odls_default_child_died(orte_proc_t *child)
228228
return false;
229229
}
230230

231+
232+
/* deliver a signal to a specified pid. */
231233
static int odls_default_kill_local(pid_t pid, int signum)
232234
{
233-
pid_t pgrp;
234-
235-
#if HAVE_SETPGID
236-
pgrp = getpgid(pid);
237-
if (-1 != pgrp) {
238-
/* target the lead process of the process
239-
* group so we ensure that the signal is
240-
* seen by all members of that group. This
241-
* ensures that the signal is seen by any
242-
* child processes our child may have
243-
* started
244-
*/
245-
pid = pgrp;
246-
}
247-
#endif
248235
if (0 != kill(pid, signum)) {
249236
if (ESRCH != errno) {
250237
OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
@@ -391,13 +378,6 @@ static int do_child(orte_app_context_t* context,
391378
long fd, fdmax = sysconf(_SC_OPEN_MAX);
392379
char *param, *msg;
393380

394-
if (orte_forward_job_control) {
395-
/* Set a new process group for this child, so that a
396-
SIGSTOP can be sent to it without being sent to the
397-
orted. */
398-
setpgid(0, 0);
399-
}
400-
401381
/* Setup the pipe to be close-on-exec */
402382
opal_fd_set_cloexec(write_fd);
403383

@@ -720,10 +700,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
720700
}
721701

722702
if (pid == 0) {
723-
close(p[0]);
724-
#if HAVE_SETPGID
725-
setpgid(0, 0);
726-
#endif
703+
close(p[0]);
727704
do_child(context, child, environ_copy, jobdat, p[1], opts);
728705
/* Does not return */
729706
}
@@ -770,11 +747,6 @@ static int send_signal(pid_t pid, int signal)
770747
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
771748
signal, (long)pid));
772749

773-
if (orte_forward_job_control) {
774-
/* Send the signal to the process group rather than the
775-
process. The child is the leader of its process group. */
776-
pid = -pid;
777-
}
778750
if (kill(pid, signal) != 0) {
779751
switch(errno) {
780752
case EINVAL:

orte/runtime/orte_globals.c

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -143,9 +143,6 @@ char *orte_output_filename = NULL;
143143
/* generate new xterm windows to display output from specified ranks */
144144
char *orte_xterm = NULL;
145145

146-
/* whether or not to forward SIGTSTP and SIGCONT signals */
147-
bool orte_forward_job_control = false;
148-
149146
/* report launch progress */
150147
bool orte_report_launch_progress = false;
151148

orte/runtime/orte_globals.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -521,9 +521,6 @@ ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_topologies;
521521
ORTE_DECLSPEC extern opal_pointer_array_t *orte_local_children;
522522
ORTE_DECLSPEC extern orte_vpid_t orte_total_procs;
523523

524-
/* whether or not to forward SIGTSTP and SIGCONT signals */
525-
ORTE_DECLSPEC extern bool orte_forward_job_control;
526-
527524
/* IOF controls */
528525
ORTE_DECLSPEC extern bool orte_tag_output;
529526
ORTE_DECLSPEC extern bool orte_timestamp_output;

orte/runtime/orte_mca_params.c

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -543,14 +543,6 @@ int orte_register_params(void)
543543
orte_map_stddiag_to_stderr = true;
544544
}
545545

546-
/* whether or not to forward SIGTSTP and SIGCONT signals */
547-
orte_forward_job_control = false;
548-
(void) mca_base_var_register ("orte", "orte", NULL, "forward_job_control",
549-
"Forward SIGTSTP (after converting to SIGSTOP) and SIGCONT signals to the application procs [default: no]",
550-
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
551-
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
552-
&orte_forward_job_control);
553-
554546
/* whether or not to report launch progress */
555547
orte_report_launch_progress = false;
556548
(void) mca_base_var_register ("orte", "orte", NULL, "report_launch_progress",

orte/tools/orte-submit/orte-submit.1in

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1133,9 +1133,7 @@ SIGUSR1 and SIGUSR2 signals received by orte-submit are propagated to
11331133
all processes in the job.
11341134
.
11351135
.PP
1136-
One can turn on forwarding of SIGSTOP and SIGCONT to the program executed
1137-
by ompi-submit by setting the MCA parameter orte_forward_job_control to 1.
1138-
A SIGTSTOP signal to ompi-submit will then cause a SIGSTOP signal to be sent
1136+
A SIGTSTP signal to ompi-submit will cause a SIGSTOP signal to be sent
11391137
to all of the programs started by ompi-submit and likewise a SIGCONT signal
11401138
to ompi-submit will cause a SIGCONT to be sent.
11411139
.

orte/tools/orterun/orterun.1in

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,9 +1240,7 @@ SIGUSR1 and SIGUSR2 signals received by orterun are propagated to
12401240
all processes in the job.
12411241
.
12421242
.PP
1243-
One can turn on forwarding of SIGSTOP and SIGCONT to the program executed
1244-
by mpirun by setting the MCA parameter orte_forward_job_control to 1.
1245-
A SIGTSTOP signal to mpirun will then cause a SIGSTOP signal to be sent
1243+
A SIGTSTP signal to mpirun will cause a SIGSTOP signal to be sent
12461244
to all of the programs started by mpirun and likewise a SIGCONT signal
12471245
to mpirun will cause a SIGCONT to be sent.
12481246
.

0 commit comments

Comments
 (0)