Skip to content

Commit d7a33b8

Browse files
committed
EH: CS-777: Improve qsub -sync so that is support multiple state (like r)
1 parent e5bfe03 commit d7a33b8

File tree

6 files changed

+158
-51
lines changed

6 files changed

+158
-51
lines changed

doc/markdown/man/man1/submit.include.md

+28-13
Original file line numberDiff line numberDiff line change
@@ -1090,26 +1090,41 @@ defined JSV instances as parameter with the names *global_q_soft* and *global_l_
10901090
Find more information in the sections describing *-q*, *-l* and *-scope*. (see *-jsv* option below or find more
10911091
information concerning JSV in xxqs_name_sxx_jsv(1))
10921092

1093-
## -sync y\[es\]\|n\[o\]
1093+
## -sync r|x|n (or y\[es\]\|n\[o\])
10941094

10951095
Available for `qsub`.
10961096

1097-
`-sync y` causes `qsub` to wait for the job to complete before exiting. If the job completes successfully,
1097+
The `-sync` option *yes* and *no* are deprecated. Use *r*, *x* and *n* instead. `qsub -sync yes` is equivalent to
1098+
`qsub -sync x` and `qsub -sync no` is equivalent to `qsub -sync n`.
1099+
1100+
`-sync` causes `qsub` to wait for one or multiple job states to be reached before exiting. The option argument
1101+
specifies the job state(s) to wait for.
1102+
1103+
If `-sync r` is used `qsub` will wait for the job to leave the *qw* state before it terminates.
1104+
If `-sync r` is used in conjunction with `-t n[-m[:s]]`, `qsub` will wait for at least one task of the array job to
1105+
leave the queues-waiting state. A task is considered to leave the queues-waiting state if it enters the running
1106+
state or if it is deleted before start.
1107+
1108+
`-sync x` causes `qsub` to wait for the job to complete before exiting. If the job completes successfully,
10981109
The exit code of `qsub` will be that of the completed job. If the job fails to complete successfully,
10991110
`qsub` will print out an error message indicating why the job failed and will have an exit code of 1. If `qsub`
1100-
is interrupted, e.g. with CTRL-C, before the job completes, the job will be canceled. With the `-sync n` option,
1101-
`qsub` will exit with an exit code of 0 as soon as the job is submitted successfully. `-sync n` is default for
1102-
`qsub`.
1103-
1104-
If `-sync y` is used in conjunction with `-now y`, `qsub` will behave as though only `-now y` were given until the
1105-
job has been successfully scheduled, after which time `qsub` will behave as though only `-sync y` were given.
1106-
If `-sync y` is used in conjunction with `-t n[-m[:s]]`, `qsub` will wait for all the job's tasks to complete
1107-
before exiting. If all the job's tasks complete successfully, The exit code ob `qsub` will be that of the first
1108-
completed job tasks with a non-zero exit code, or 0 if all job tasks exited with an exit code of 0. If any of the
1109-
job's tasks fail to complete successfully, `qsub` will print out an error message indicating why the job task(s)
1110-
failed and will have an exit code of 1. If `qsub` is interrupted, e.g. with CTRL-C, before the job completes,
1111+
is interrupted, e.g. with CTRL-C, before the job completes, the job will be canceled.
1112+
1113+
If `-sync x` is used in conjunction with `-t n[-m[:s]]`, `qsub` will wait for all the job's tasks to complete
1114+
before exiting. If all the job's tasks complete successfully, The exit code ob `qsub` will be that of the first
1115+
completed job tasks with a non-zero exit code, or 0 if all job tasks exited with an exit code of 0. If any of the
1116+
job's tasks fail to complete successfully, `qsub` will print out an error message indicating why the job task(s)
1117+
failed and will have an exit code of 1. If `qsub` is interrupted, e.g. with CTRL-C, before the job completes,
11111118
all the job's tasks will be canceled.
11121119

1120+
With the `-sync n` option (default if nothing else is specified), `qsub` will exit with an exit code of 0 as soon as
1121+
the job is submitted successfully.
1122+
1123+
If `-sync rx` is used, `qsub` will behave as though only `-sync r` were given until the job has been started,
1124+
after which time `qsub` will behave as though only `-sync x` were given. If `-sync ...` is used in conjunction with
1125+
`-now y`, `qsub` will behave as though only `-now y` were given until the job has been successfully scheduled,
1126+
after which time `qsub` will behave as though only `-sync ...` were given.
1127+
11131128
Information that this switch was specified during submission is not available in the JSV context. (see `-jsv` option
11141129
above or find more information concerning JSV in xxqs_name_sxx_jsv(1))
11151130

source/clients/qsub/msg_qsub.h

+1
Original file line numberDiff line numberDiff line change
@@ -55,5 +55,6 @@
5555
#define MSG_QSUB_INTERRUPTED _MESSAGE(210015, _("Interrupted!"))
5656
#define MSG_QSUB_TERMINATING _MESSAGE(210016, _("Please wait while qsub shuts down."))
5757
#define MSG_QSUB_COULDNOTREADSCRIPT_S _MESSAGE(210017, _("Unable to read script file because of error: "))
58+
#define MSG_QSUB_JOBHASSTARTED_S _MESSAGE(210018, _("Job " SFN " has started.\n"))
5859

5960
// clang-format on

source/clients/qsub/ocs_qsub.cc

+46-27
Original file line numberDiff line numberDiff line change
@@ -47,22 +47,24 @@
4747
#include "sgeobj/cull/sge_all_listsL.h"
4848
#include "sgeobj/sge_answer.h"
4949
#include "sgeobj/sge_job.h"
50+
#include "sgeobj/sge_daemonize.h"
5051

5152
#include "comm/commlib.h"
5253

5354
#include "japi/japi.h"
5455
#include "japi/japiP.h"
5556

5657
#include "gdi/sge_security.h"
57-
#include "sgeobj/sge_daemonize.h"
5858
#include "gdi/sge_gdi.h"
59+
#include "gdi/ocs_gdi_ClientBase.h"
5960

6061
#include "sig_handlers.h"
6162
#include "basis_types.h"
6263
#include "usage.h"
6364
#include "parse_job_cull.h"
6465
#include "ocs_client_parse.h"
6566
#include "ocs_client_job.h"
67+
#include "parse_qsub.h"
6668
#include "msg_clients_common.h"
6769
#include "msg_qsub.h"
6870
#include "msg_qmaster.h"
@@ -93,7 +95,7 @@ main(int argc, const char **argv)
9395
int exit_status = 0;
9496
int just_verify;
9597
int tmp_ret;
96-
int wait_for_job = 0, is_immediate = 0;
98+
int is_immediate = 0;
9799
dstring session_key_out = DSTRING_INIT;
98100
dstring diag = DSTRING_INIT;
99101
dstring jobid = DSTRING_INIT;
@@ -208,9 +210,12 @@ main(int argc, const char **argv)
208210
/* If "-sync y" is set, wait for the job to end. */
209211
/* Remove all -sync switches since cull_parse_job_parameter()
210212
* doesn't know what to do with them. */
213+
bool wait_for_job = false;
214+
u_long32 sync_opt = SYNC_NO;
211215
while ((ep = lGetElemStrRW(opts_all, SPA_switch_val, "-sync"))) {
212-
if (lGetInt(ep, SPA_argval_lIntT) == TRUE) {
213-
wait_for_job = 1;
216+
sync_opt = lGetUlong(ep, SPA_argval_lUlongT);
217+
if (sync_opt != SYNC_NO) {
218+
wait_for_job = true;
214219
}
215220

216221
lRemoveElem(opts_all, &ep);
@@ -416,39 +421,53 @@ main(int argc, const char **argv)
416421
}
417422
}
418423

424+
// We have to wait for certain job states
419425
if (wait_for_job) {
420-
/* Rather than using japi_synchronize on ALL for bulk jobs, we use
421-
* japi_wait on ANY num_tasks times because with synchronize, we would
422-
* have to wait for all the tasks to finish before we know if any
423-
* finished. */
424-
for (count = 0; count < num_tasks; count++) {
425-
/* Since there's only one running job in the session, we can just
426-
* wait for ANY. */
427-
if ((tmp_ret = japi_wait(DRMAA_JOB_IDS_SESSION_ANY, &jobid, &stat,
428-
DRMAA_TIMEOUT_WAIT_FOREVER, JAPI_JOB_FINISH, &event,
429-
nullptr, &diag)) != DRMAA_ERRNO_SUCCESS) {
430-
if ((tmp_ret != DRMAA_ERRNO_EXIT_TIMEOUT) &&
431-
(tmp_ret != DRMAA_ERRNO_NO_ACTIVE_SESSION)) {
426+
427+
// JOB START: just wait for the first task to start
428+
if ((sync_opt & SYNC_JOB_START) == SYNC_JOB_START) {
429+
tmp_ret = japi_wait(DRMAA_JOB_IDS_SESSION_ANY, &jobid, &stat, DRMAA_TIMEOUT_WAIT_FOREVER, JAPI_JOB_START, &event, nullptr, &diag);
430+
431+
if (tmp_ret != DRMAA_ERRNO_SUCCESS) {
432+
if (tmp_ret != DRMAA_ERRNO_EXIT_TIMEOUT && tmp_ret != DRMAA_ERRNO_NO_ACTIVE_SESSION) {
432433
fprintf(stderr, "\n");
433434
fprintf(stderr, MSG_QSUB_COULDNOTWAITFORJOB_S, sge_dstring_get_string(&diag));
434435
fprintf(stderr, "\n");
435436
}
436437

437438
exit_status = 1;
438439
goto Error;
440+
} else {
441+
printf(MSG_QSUB_JOBHASSTARTED_S, sge_dstring_get_string(&jobid));
439442
}
443+
}
440444

441-
/* report how job finished */
442-
/* If the job is an array job, use the first non-zero exit code as
443-
* the exit code for qsub. */
444-
if (exit_status == 0) {
445-
exit_status = report_exit_status(stat,
446-
sge_dstring_get_string(&jobid));
447-
}
448-
/* If we've already found a non-zero exit code, just print the exit
449-
* info for the task. */
450-
else {
451-
report_exit_status(stat, sge_dstring_get_string(&jobid));
445+
// JOB END: Now wait for the end of *all* tasks
446+
if ((sync_opt & SYNC_JOB_END) == SYNC_JOB_END) {
447+
for (count = 0; count < num_tasks; count++) {
448+
// Rather than using japi_synchronize on ALL for bulk jobs, we use japi_wait on ANY num_tasks times because with synchronize, we would
449+
// have to wait for all the tasks to finish before we know if any finished.
450+
// Since there's only one running job in the session, we can just wait for ANY.
451+
tmp_ret = japi_wait(DRMAA_JOB_IDS_SESSION_ANY, &jobid, &stat, DRMAA_TIMEOUT_WAIT_FOREVER, JAPI_JOB_FINISH, &event, nullptr, &diag);
452+
if (tmp_ret != DRMAA_ERRNO_SUCCESS) {
453+
if (tmp_ret != DRMAA_ERRNO_EXIT_TIMEOUT && tmp_ret != DRMAA_ERRNO_NO_ACTIVE_SESSION) {
454+
fprintf(stderr, "\n");
455+
fprintf(stderr, MSG_QSUB_COULDNOTWAITFORJOB_S, sge_dstring_get_string(&diag));
456+
fprintf(stderr, "\n");
457+
}
458+
459+
exit_status = 1;
460+
goto Error;
461+
}
462+
463+
// report how job finished
464+
if (exit_status == 0) {
465+
// If the job is an array job, use the first non-zero exit code as the exit code for qsub.
466+
exit_status = report_exit_status(stat, sge_dstring_get_string(&jobid));
467+
} else {
468+
// If we've already found a non-zero exit code, just print the exit info for the task.
469+
report_exit_status(stat, sge_dstring_get_string(&jobid));
470+
}
452471
}
453472
}
454473
}

source/common/msg_common.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -851,7 +851,7 @@
851851
#define MSG_GDI_USAGE_Aattr_OPT "[-Aattr obj_nm fname obj_id_list]"
852852
#define MSG_GDI_UTEXT_Aattr_OPT _MESSAGE(23483, _("add to a list attribute of an object"))
853853

854-
#define MSG_GDI_USAGE_sync_OPT_YN "[-sync y[es]|n[o]]"
854+
#define MSG_GDI_USAGE_sync_OPT_YN "[-sync r|x|n]"
855855
#define MSG_GDI_UTEXT_sync_OPT_YN _MESSAGE(23484, _("wait for job to end and return exit code"))
856856

857857
#define MSG_GDI_USAGE_JQ_DEST_OPR "job_queue_list"

source/common/parse_qsub.cc

+75-10
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,8 @@
6868

6969
static int var_list_parse_from_environment(lList **lpp, char **envp);
7070
static int sge_parse_checkpoint_interval(const char *time_str);
71-
static int set_yn_option (lList **opts, u_long32 opt, const char *arg, const char *value,
72-
lList **alpp);
73-
71+
static int set_yn_option (lList **opts, u_long32 opt, const char *arg, const char *value, lList **alpp);
72+
static int ocs_parse_sync_switch(lList **opts, u_long32 opt, const char *arg, const char *value, lList **alpp);
7473

7574

7675
/*
@@ -1571,23 +1570,21 @@ DTRACE;
15711570

15721571
/*----------------------------------------------------------------------------*/
15731572
/* -sync y[es]|n[o] */
1573+
/* -sync r|x|n */
15741574

1575-
if(!strcmp("-sync", *sp)) {
1575+
if (!strcmp("-sync", *sp)) {
15761576
if (lGetElemStr(*pcmdline, SPA_switch_val, *sp)) {
1577-
answer_list_add_sprintf(&answer, STATUS_EEXIST, ANSWER_QUALITY_WARNING,
1578-
MSG_PARSE_XOPTIONALREADYSETOVERWRITINGSETING_S, *sp );
1577+
answer_list_add_sprintf(&answer, STATUS_EEXIST, ANSWER_QUALITY_WARNING, MSG_PARSE_XOPTIONALREADYSETOVERWRITINGSETING_S, *sp);
15791578
}
1580-
/* next field is yes/no switch */
15811579
sp++;
15821580
if(!*sp) {
1583-
answer_list_add_sprintf(&answer, STATUS_ESEMANTIC, ANSWER_QUALITY_ERROR,
1584-
MSG_PARSE_XOPTIONMUSTHAVEARGUMENT_S, "-sync" );
1581+
answer_list_add_sprintf(&answer, STATUS_ESEMANTIC, ANSWER_QUALITY_ERROR, MSG_PARSE_XOPTIONMUSTHAVEARGUMENT_S, "-sync");
15851582
DRETURN(answer);
15861583
}
15871584

15881585
DPRINTF("\"-sync %s\"\n", *sp);
15891586

1590-
if (set_yn_option (pcmdline, sync_OPT, *(sp - 1), *sp, &answer) != STATUS_OK) {
1587+
if (ocs_parse_sync_switch(pcmdline, sync_OPT, *(sp - 1), *sp, &answer) != STATUS_OK) {
15911588
DRETURN(answer);
15921589
}
15931590

@@ -2222,6 +2219,74 @@ static int set_yn_option (lList **opts, u_long32 opt, const char *arg, const cha
22222219
return STATUS_OK;
22232220
}
22242221

2222+
/** @brief Parse the -sync option switch
2223+
*
2224+
* Supports the old and new format of the -sync option switch.
2225+
* Old format: -sync y(es)|n(o)
2226+
* New format: -sync r|y|x|E|n
2227+
*
2228+
* r - job start
2229+
* x or y - job end
2230+
* n - no sync
2231+
*
2232+
* @param opts list of options
2233+
* @param opt option code
2234+
* @param arg option argument
2235+
* @param value option value
2236+
* @param alpp answer list
2237+
*
2238+
* @return STATUS_OK if success, STATUS_ERROR1 otherwise
2239+
*/
2240+
static int
2241+
ocs_parse_sync_switch(lList **opts, u_long32 opt, const char *arg, const char *value, lList **alpp) {
2242+
DENTER(TOP_LAYER);
2243+
u_long32 sync_bits = SYNC_UNINITIALIZED;
2244+
2245+
// -sync yes|no (pre OCS format )
2246+
if (strcasecmp("yes", value) == 0) {
2247+
sync_bits = SYNC_JOB_END;
2248+
} else if (strcasecmp ("no", value) == 0) {
2249+
sync_bits = SYNC_NO;
2250+
}
2251+
2252+
// -sync r|y|x|E|n (new OCS format)
2253+
if (sync_bits == SYNC_UNINITIALIZED) {
2254+
const int len = strlen(value);
2255+
for (int pos = 0; pos < len; pos++) {
2256+
switch (value[pos]) {
2257+
case 'r':
2258+
sync_bits |= SYNC_JOB_START;
2259+
break;
2260+
case 'x':
2261+
case 'y':
2262+
sync_bits |= SYNC_JOB_END;
2263+
break;
2264+
case 'n':
2265+
sync_bits |= SYNC_NO;
2266+
break;
2267+
default:
2268+
snprintf(SGE_EVENT, SGE_EVENT_SIZE, MSG_PARSE_INVALIDOPTIONARGUMENT_SS, arg, value);
2269+
answer_list_add(alpp, SGE_EVENT, STATUS_ESYNTAX, ANSWER_QUALITY_ERROR);
2270+
DRETURN(STATUS_ERROR1);
2271+
}
2272+
}
2273+
}
2274+
2275+
// check if SYNC_NO was combined with other bits which might make no sense
2276+
if ((sync_bits & SYNC_NO) != 0 && sync_bits != SYNC_NO) {
2277+
snprintf(SGE_EVENT, SGE_EVENT_SIZE, MSG_PARSE_INVALIDOPTIONARGUMENT_SS, arg, value);
2278+
answer_list_add(alpp, SGE_EVENT, STATUS_ESYNTAX, ANSWER_QUALITY_ERROR);
2279+
DRETURN(STATUS_ERROR1);
2280+
}
2281+
2282+
// transport bits via opts list
2283+
lListElem *ep_opt = nullptr;
2284+
ep_opt = sge_add_arg(opts, opt, lUlongT, arg, value);
2285+
lSetUlong(ep_opt,SPA_argval_lUlongT, sync_bits);
2286+
2287+
DRETURN(STATUS_OK);
2288+
}
2289+
22252290
/* This method is not thread safe. Fortunately, it is only used by the
22262291
* -cwd switch which can be forbidden in DRMAA. */
22272292
char *reroot_path(lListElem* pjob, const char *path, lList **alpp) {

source/common/parse_qsub.h

+7
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,13 @@
5858
#define FLG_USE_PSEUDOS 1
5959
#define FLG_QALTER 2
6060

61+
typedef enum {
62+
SYNC_UNINITIALIZED = 0x00000000,
63+
SYNC_NO = 0x00000001,
64+
SYNC_JOB_END = 0x00000002,
65+
SYNC_JOB_START = 0x00000004,
66+
} sync_switch_t;
67+
6168
/* I've added a -wd option to cull_parse_job_parameter() to deal with the
6269
* DRMAA_WD attribute. It makes sense to me that since -wd exists and is
6370
* handled by cull_parse_job_parameter() that -cwd should just become an alias

0 commit comments

Comments
 (0)