|
47 | 47 | #include "sgeobj/cull/sge_all_listsL.h"
|
48 | 48 | #include "sgeobj/sge_answer.h"
|
49 | 49 | #include "sgeobj/sge_job.h"
|
| 50 | +#include "sgeobj/sge_daemonize.h" |
50 | 51 |
|
51 | 52 | #include "comm/commlib.h"
|
52 | 53 |
|
53 | 54 | #include "japi/japi.h"
|
54 | 55 | #include "japi/japiP.h"
|
55 | 56 |
|
56 | 57 | #include "gdi/sge_security.h"
|
57 |
| -#include "sgeobj/sge_daemonize.h" |
58 | 58 | #include "gdi/sge_gdi.h"
|
| 59 | +#include "gdi/ocs_gdi_ClientBase.h" |
59 | 60 |
|
60 | 61 | #include "sig_handlers.h"
|
61 | 62 | #include "basis_types.h"
|
62 | 63 | #include "usage.h"
|
63 | 64 | #include "parse_job_cull.h"
|
64 | 65 | #include "ocs_client_parse.h"
|
65 | 66 | #include "ocs_client_job.h"
|
| 67 | +#include "parse_qsub.h" |
66 | 68 | #include "msg_clients_common.h"
|
67 | 69 | #include "msg_qsub.h"
|
68 | 70 | #include "msg_qmaster.h"
|
@@ -93,7 +95,7 @@ main(int argc, const char **argv)
|
93 | 95 | int exit_status = 0;
|
94 | 96 | int just_verify;
|
95 | 97 | int tmp_ret;
|
96 |
| - int wait_for_job = 0, is_immediate = 0; |
| 98 | + int is_immediate = 0; |
97 | 99 | dstring session_key_out = DSTRING_INIT;
|
98 | 100 | dstring diag = DSTRING_INIT;
|
99 | 101 | dstring jobid = DSTRING_INIT;
|
@@ -208,9 +210,12 @@ main(int argc, const char **argv)
|
208 | 210 | /* If "-sync y" is set, wait for the job to end. */
|
209 | 211 | /* Remove all -sync switches since cull_parse_job_parameter()
|
210 | 212 | * doesn't know what to do with them. */
|
| 213 | + bool wait_for_job = false; |
| 214 | + u_long32 sync_opt = SYNC_NO; |
211 | 215 | while ((ep = lGetElemStrRW(opts_all, SPA_switch_val, "-sync"))) {
|
212 |
| - if (lGetInt(ep, SPA_argval_lIntT) == TRUE) { |
213 |
| - wait_for_job = 1; |
| 216 | + sync_opt = lGetUlong(ep, SPA_argval_lUlongT); |
| 217 | + if (sync_opt != SYNC_NO) { |
| 218 | + wait_for_job = true; |
214 | 219 | }
|
215 | 220 |
|
216 | 221 | lRemoveElem(opts_all, &ep);
|
@@ -416,39 +421,53 @@ main(int argc, const char **argv)
|
416 | 421 | }
|
417 | 422 | }
|
418 | 423 |
|
| 424 | + // We have to wait for certain job states |
419 | 425 | if (wait_for_job) {
|
420 |
| - /* Rather than using japi_synchronize on ALL for bulk jobs, we use |
421 |
| - * japi_wait on ANY num_tasks times because with synchronize, we would |
422 |
| - * have to wait for all the tasks to finish before we know if any |
423 |
| - * finished. */ |
424 |
| - for (count = 0; count < num_tasks; count++) { |
425 |
| - /* Since there's only one running job in the session, we can just |
426 |
| - * wait for ANY. */ |
427 |
| - if ((tmp_ret = japi_wait(DRMAA_JOB_IDS_SESSION_ANY, &jobid, &stat, |
428 |
| - DRMAA_TIMEOUT_WAIT_FOREVER, JAPI_JOB_FINISH, &event, |
429 |
| - nullptr, &diag)) != DRMAA_ERRNO_SUCCESS) { |
430 |
| - if ((tmp_ret != DRMAA_ERRNO_EXIT_TIMEOUT) && |
431 |
| - (tmp_ret != DRMAA_ERRNO_NO_ACTIVE_SESSION)) { |
| 426 | + |
| 427 | + // JOB START: just wait for the first task to start |
| 428 | + if ((sync_opt & SYNC_JOB_START) == SYNC_JOB_START) { |
| 429 | + tmp_ret = japi_wait(DRMAA_JOB_IDS_SESSION_ANY, &jobid, &stat, DRMAA_TIMEOUT_WAIT_FOREVER, JAPI_JOB_START, &event, nullptr, &diag); |
| 430 | + |
| 431 | + if (tmp_ret != DRMAA_ERRNO_SUCCESS) { |
| 432 | + if (tmp_ret != DRMAA_ERRNO_EXIT_TIMEOUT && tmp_ret != DRMAA_ERRNO_NO_ACTIVE_SESSION) { |
432 | 433 | fprintf(stderr, "\n");
|
433 | 434 | fprintf(stderr, MSG_QSUB_COULDNOTWAITFORJOB_S, sge_dstring_get_string(&diag));
|
434 | 435 | fprintf(stderr, "\n");
|
435 | 436 | }
|
436 | 437 |
|
437 | 438 | exit_status = 1;
|
438 | 439 | goto Error;
|
| 440 | + } else { |
| 441 | + printf(MSG_QSUB_JOBHASSTARTED_S, sge_dstring_get_string(&jobid)); |
439 | 442 | }
|
| 443 | + } |
440 | 444 |
|
441 |
| - /* report how job finished */ |
442 |
| - /* If the job is an array job, use the first non-zero exit code as |
443 |
| - * the exit code for qsub. */ |
444 |
| - if (exit_status == 0) { |
445 |
| - exit_status = report_exit_status(stat, |
446 |
| - sge_dstring_get_string(&jobid)); |
447 |
| - } |
448 |
| - /* If we've already found a non-zero exit code, just print the exit |
449 |
| - * info for the task. */ |
450 |
| - else { |
451 |
| - report_exit_status(stat, sge_dstring_get_string(&jobid)); |
| 445 | + // JOB END: Now wait for the end of *all* tasks |
| 446 | + if ((sync_opt & SYNC_JOB_END) == SYNC_JOB_END) { |
| 447 | + for (count = 0; count < num_tasks; count++) { |
| 448 | + // Rather than using japi_synchronize on ALL for bulk jobs, we use japi_wait on ANY num_tasks times because with synchronize, we would |
| 449 | + // have to wait for all the tasks to finish before we know if any finished. |
| 450 | + // Since there's only one running job in the session, we can just wait for ANY. |
| 451 | + tmp_ret = japi_wait(DRMAA_JOB_IDS_SESSION_ANY, &jobid, &stat, DRMAA_TIMEOUT_WAIT_FOREVER, JAPI_JOB_FINISH, &event, nullptr, &diag); |
| 452 | + if (tmp_ret != DRMAA_ERRNO_SUCCESS) { |
| 453 | + if (tmp_ret != DRMAA_ERRNO_EXIT_TIMEOUT && tmp_ret != DRMAA_ERRNO_NO_ACTIVE_SESSION) { |
| 454 | + fprintf(stderr, "\n"); |
| 455 | + fprintf(stderr, MSG_QSUB_COULDNOTWAITFORJOB_S, sge_dstring_get_string(&diag)); |
| 456 | + fprintf(stderr, "\n"); |
| 457 | + } |
| 458 | + |
| 459 | + exit_status = 1; |
| 460 | + goto Error; |
| 461 | + } |
| 462 | + |
| 463 | + // report how job finished |
| 464 | + if (exit_status == 0) { |
| 465 | + // If the job is an array job, use the first non-zero exit code as the exit code for qsub. |
| 466 | + exit_status = report_exit_status(stat, sge_dstring_get_string(&jobid)); |
| 467 | + } else { |
| 468 | + // If we've already found a non-zero exit code, just print the exit info for the task. |
| 469 | + report_exit_status(stat, sge_dstring_get_string(&jobid)); |
| 470 | + } |
452 | 471 | }
|
453 | 472 | }
|
454 | 473 | }
|
|
0 commit comments