Skip to content

functionality to retry failed apps on different ranks #146

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jun 14, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 54 additions & 30 deletions turbine/code/lib/app.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,19 @@ namespace eval turbine {

proc app_init { } {
variable app_initialized
variable app_retries
variable app_retries_local
variable app_retries_reput
variable app_backoff
# Artificial random delay (seconds) just before launching each app
variable app_delay_time

if { [ info exists app_initialized ] } return

set app_initialized 1
getenv_integer TURBINE_APP_RETRIES 0 app_retries
getenv_double TURBINE_APP_DELAY 0 app_delay_time
getenv_integer TURBINE_APP_RETRY_LOCAL 0 app_retries_local
getenv_integer TURBINE_APP_RETRY_REPUT 0 app_retries_reput

getenv_double TURBINE_APP_DELAY 0 app_delay_time
if { $app_delay_time > 0 } {
if { [ adlb::rank ] == 0 } {
log "TURBINE_APP_DELAY: $app_delay_time"
Expand Down Expand Up @@ -80,32 +82,54 @@ namespace eval turbine {
incr tries
log "shell: $cmd $args $stdios"
set start [ clock milliseconds ]
if { $tcl_version >= 8.6 } {
try {
c::sync_exec $stdin_src $stdout_dst $stderr_dst $cmd {*}$args
break
} trap {TURBINE ERROR} { msg } {
# Error: try again
app_error $tries $msg $cmd $args
continue
}
variable app_retries_local
variable app_retries_reput
if { $app_retries_reput > 0 & $app_retries_local == 0 } {
if { $tries == 1 } {
app_run $stdin_src $stdout_dst $stderr_dst $cmd $args $tries $app_retries_reput
continue
}
set target_rank [ turbine::random_worker ]
set tcltmp:prio [ turbine::get_priority ]
adlb::put $target_rank 0 [ list app_run $stdin_src $stdout_dst $stderr_dst $cmd $args $tries $app_retries_reput ] ${tcltmp:prio} 1
if { $tries >= $app_retries_reput } { break }
} else {
# Tcl 8.5
if { ! [ catch { c::sync_exec $stdin_src \
$stdout_dst $stderr_dst $cmd {*}$args } \
results options ] } {
break
}
# Error: try again
app_error $tries $options $cmd {*}$args
app_run $stdin_src $stdout_dst $stderr_dst $cmd $args $tries $app_retries_local
if { $tries >= $app_retries_local } { break }
}
}
app_delay_retries $tries
} ; # End while loop

set stop [ clock milliseconds ]
set duration [ format "%0.3f" [ expr ($stop-$start)/1000.0 ] ]
log "shell command duration: $duration"
}

proc app_error { tries options cmd args } {
# Run a single attempt of an external app through c::sync_exec.
#   stdin, stdout, stderr: redirection arguments handed to c::sync_exec
#   cmd:           the executable to run
#   args:          the app arguments as ONE Tcl list (args is not the last
#                  parameter here, so Tcl does not treat it as varargs)
#   tries:         attempt counter, forwarded to app_error
#   total_retries: retry limit, forwarded to app_error
# On success this returns with -code break, which terminates the retry
# loop of the caller.  On failure the error is reported via app_error
# and, if app_error itself returns, this proc returns normally so the
# caller can decide whether to try again.
proc app_run { stdin stdout stderr cmd args tries total_retries } {
    global tcl_version
    if { $tcl_version >= 8.6 } {
        try {
            c::sync_exec $stdin $stdout $stderr $cmd {*}$args
            return -code break
        } trap {TURBINE ERROR} { results } {
            # Error: try again
            # Expand args so app_error receives them the same way as in
            # the Tcl 8.5 branch below
            app_error $tries $total_retries $results $cmd {*}$args
        }
    } else {
        # Tcl 8.5: no try/trap, so use catch and the options dict
        # Use this proc's own parameter names (stdin/stdout/stderr);
        # stdin_src/stdout_dst/stderr_dst do not exist in this scope
        if { ! [ catch { c::sync_exec $stdin \
                             $stdout $stderr $cmd {*}$args } \
                     results options ] } {
            return -code break
        } else {
            # Error: try again
            app_error $tries $total_retries $options $cmd {*}$args
        }
    }
}

proc app_error { tries app_retries_reput options cmd args } {
global tcl_version
if { $tcl_version >= 8.6 } {
set msg $options
Expand All @@ -114,13 +138,15 @@ namespace eval turbine {
set errorinfo [ dict get $options -errorinfo ]
set msg "$errorinfo"
}
variable app_retries
set retry [ expr $tries <= $app_retries ]
# variable app_retries_reput
set retry [ expr $tries <= $app_retries_reput ]
if { ! $retry } {
turbine_error "app execution failed" on: [ c_utils::hostname ] \
"\n $msg" "\n command: $cmd $args"
rank [ adlb::rank ] "\n $msg" "\n command: $cmd $args"
} else {
log "$msg: retries: $tries/$app_retries_reput on:
[ c_utils::hostname ], rank [ adlb::rank ]"
}
app_retry $msg $tries
}

proc app_delay { } {
Expand All @@ -132,11 +158,9 @@ namespace eval turbine {
}
}

proc app_retry { msg tries } {
# Retry:
variable app_retries
proc app_delay_retries { tries } {
variable app_retries_reput
variable app_backoff
log "$msg: retries: $tries/$app_retries on: [ c_utils::hostname ]"
set delay [ expr { $app_backoff * pow(2, $tries) * rand() } ]
after [ expr round(1000 * $delay) ]
}
Expand Down
1 change: 0 additions & 1 deletion turbine/code/src/tcl/adlb/tcl-adlb.c
Original file line number Diff line number Diff line change
Expand Up @@ -1040,7 +1040,6 @@ ADLB_Put_Cmd(ClientData cdata, Tcl_Interp *interp,
DEBUG_ADLB("adlb::put: target_rank: %i type: %i \"%s\" %i",
target_rank, work_type, cmd, opts.priority);


adlb_code ac = ADLB_Put(cmd, cmd_len+1, target_rank, adlb_comm_rank,
work_type, opts);
TCL_CONDITION(ac == ADLB_SUCCESS, "ADLB_Put failed!");
Expand Down
26 changes: 10 additions & 16 deletions turbine/code/src/tcl/turbine/tcl-turbine.c
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,8 @@ rule_set_opts_default(struct rule_opts* opts,
int buffer_size)
{
opts->name = buffer;
if (action != NULL) {
if (action != NULL)
{
assert(opts->name != NULL);
rule_set_name_default(opts->name, buffer_size, action);
}
Expand Down Expand Up @@ -515,7 +516,8 @@ rule_opts_from_list(Tcl_Interp* interp, Tcl_Obj *const objv[],
objs[keypos], objs[valpos]);
TCL_CHECK(rc);
}
if (opts->name == NULL) {
if (opts->name == NULL)
{
rule_set_name_default(name_buffer, name_buffer_size, action);
opts->name = name_buffer;
}
Expand Down Expand Up @@ -1074,45 +1076,36 @@ Sync_Exec_Cmd(ClientData cdata, Tcl_Interp *interp,

pid_t child = fork();
TCL_CONDITION(child >= 0, "Error forking: %s", strerror(errno));
if (child == 0)
if (child == 0)
{
// Setup redirects
if (stdin_file[0] != '\0')
if (stdin_file[0] != '\0')
{
int in_fd = open(stdin_file, O_RDONLY);
if (in_fd == -1) redirect_error_exit(stdin_file, "input redirection");

rc = dup2(in_fd, 0);
if (rc == -1) dup2_error_exit("input redirection");

rc = close(in_fd);
if (rc == -1) close_error_exit("input redirection");
}

if (stdout_file[0] != '\0')
if (stdout_file[0] != '\0')
{
int out_fd = open(stdout_file, O_WRONLY | O_TRUNC | O_CREAT, 0666);
if (out_fd == -1) redirect_error_exit(stdin_file, "output redirection");

rc = dup2(out_fd, 1);
if (rc == -1) dup2_error_exit("output redirection");

rc = close(out_fd);
if (rc == -1) close_error_exit("output redirection");
}

if (stderr_file[0] != '\0')
if (stderr_file[0] != '\0')
{
int err_fd = open(stderr_file, O_WRONLY | O_TRUNC | O_CREAT, 0666);
if (err_fd == -1) redirect_error_exit(stdin_file, "output redirection");

rc = dup2(err_fd, 2);
if (rc == -1) dup2_error_exit("output redirection");

rc = close(err_fd);
if (rc == -1) close_error_exit("output redirection");
}

rc = execvp(cmd, cmd_argv);
TCL_CONDITION(rc != -1, "Error executing command %s: %s", cmd,
strerror(errno));
Expand Down Expand Up @@ -1709,7 +1702,8 @@ static int parse_coaster_opts(Tcl_Interp *interp, Tcl_Obj *const objv[],
const char *staging_mode_s = Tcl_GetString(value);
bool valid_staging_mode = false;
for (int i = 0; i < num_staging_modes; i++) {
if (strcmp(staging_mode_s, staging_modes[i].name) == 0) {
if (strcmp(staging_mode_s, staging_modes[i].name) == 0)
{
*staging_mode = staging_modes[i].mode;
valid_staging_mode = true;
break;
Expand Down
1 change: 1 addition & 0 deletions turbine/code/tests/adlb-iget.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ OUTPUT=${THIS%.sh}.out
export ADLB_EXHAUST_TIME=1

bin/turbine -l -n 4 ${SCRIPT} >& ${OUTPUT}

[[ ${?} == 0 ]] || test_result 1

grep -q "msg: hello" ${OUTPUT} || test_result 1
Expand Down
1 change: 1 addition & 0 deletions turbine/code/tests/adlb-steal-1.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ SCRIPT=${THIS%.sh}.tcl
OUTPUT=${THIS%.sh}.out

bin/turbine -l -n 3 ${SCRIPT} >& ${OUTPUT}

[[ ${?} == 0 ]] || test_result 1


Expand Down
2 changes: 1 addition & 1 deletion turbine/code/tests/adlb-steal-1.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ if { $amserver == 0 } {
set rank [ adlb::rank ]
if { $rank == 0 } {
for { set i 0 } { $i < $PUTS } { incr i } {
adlb::put $adlb::RANK_ANY $WORK_TYPE(T) "wu-$i" 0
adlb::put $adlb::RANK_ANY $WORK_TYPE(T) "wu-$i" 0 1
}
} else {
after 5000
Expand Down
3 changes: 3 additions & 0 deletions turbine/code/tests/infinite_loop.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Print the current date forever; this process only ends when it is
# killed from outside.
while :
do
  date
done
35 changes: 35 additions & 0 deletions turbine/code/tests/retry_ranks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash
# Copyright 2013 University of Chicago and Argonne National Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

source tests/test-helpers.sh

# Derive the Tcl script and output file names from this script's path
THIS=$0
SCRIPT=${THIS%.sh}.tcl
OUTPUT=${THIS%.sh}.out
# Read by turbine's app_init into app_retries_reput
export TURBINE_APP_RETRY_REPUT=5
export TURBINE_LOG=1
export PROCS=5

source $( dirname $0 )/setup.sh > ${OUTPUT} 2>&1

# Start the background process that the Tcl script kills via killall.
# Locate it next to this script (like setup.sh above) instead of in the
# current directory, which is the code root when tests are run as
# tests/<name>.sh
$( dirname $0 )/infinite_loop.sh &
LOOP_PID=${!}

set -x

bin/turbine -l -n ${PROCS} ${SCRIPT} >> ${OUTPUT} 2>&1
CODE=${?}

# Do not leave the loop running if the Tcl script did not kill it
kill -9 ${LOOP_PID} 2> /dev/null

[[ ${CODE} == 0 ]] || test_result 1


test_result 0
37 changes: 37 additions & 0 deletions turbine/code/tests/retry_ranks.tcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Test for app retries when resources not available

package require turbine 1.0
namespace import turbine::*

# Test body: issue the app call that is logged as working, then the one
# logged as failing, and finally block in the event loop until the
# global variable "period" is written.
proc main { } {
    succeed
    fail
    vwait ::period
}

# Log a banner, then run killall -9 on infinite_loop.sh as an external
# app with an empty options dict.
proc succeed { } {
    set banner [ list exec: **** WORKING APP CALL ***** ]
    turbine::c::log $banner

    set no_opts [ dict create ]
    turbine::exec_external "/usr/bin/killall" $no_opts -9 infinite_loop.sh
}


# Log a banner, then schedule a second killall of infinite_loop.sh to run
# 5 ms later from the event loop.  The scheduled script also sets the
# global "period", which is the variable main waits on with vwait.
# NOTE(review): presumably the second killall is meant to fail because
# succeed already killed the process - confirm against the intended
# retry scenario.
proc fail { } {
global period
turbine::c::log [ list ********FAILING APP CALL ****** ]
# The after script is evaluated at global level, so "period" inside it
# refers to the global variable
after 5 {
turbine::exec_external "/usr/bin/killall" [ dict create ] -9 infinite_loop.sh
set period some_value
puts "period is: $period"
}

}


# Turbine driver sequence: set defaults, initialize, run main, finalize.
turbine::defaults
# NOTE(review): servers is not set anywhere in this file - presumably
# defined by turbine::defaults; confirm
turbine::init $servers
turbine::enable_read_refcount
# Hand the main proc defined in this file to turbine::start
turbine::start main
turbine::finalize


33 changes: 33 additions & 0 deletions turbine/code/tests/retry_ranks2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash
# Copyright 2013 University of Chicago and Argonne National Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

source tests/test-helpers.sh

# Derive the Tcl script and output file names from this script's path
THIS=${0}
SCRIPT="${THIS%.sh}.tcl"
OUTPUT="${THIS%.sh}.out"

# Settings exported to the turbine run below
export TURBINE_APP_RETRY_REPUT=5 TURBINE_LOG=1 PROCS=5

source $( dirname $0 )/setup.sh > ${OUTPUT} 2>&1

set -x

# Run the Tcl test; any non-zero exit status fails the test
bin/turbine -l -n ${PROCS} ${SCRIPT} >> ${OUTPUT} 2>&1
if [[ ${?} != 0 ]]
then
  test_result 1
fi


test_result 0
24 changes: 24 additions & 0 deletions turbine/code/tests/retry_ranks2.tcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Test for app retries when resources not available

package require turbine 1.0
namespace import turbine::*

# Create test_file.txt with touch, then run rm on it twice as external
# apps, announcing each step on stdout.
proc main { } {
    set no_opts [ dict create ]
    set target test_file.txt

    puts "\n Creating a new test_file.txt"
    turbine::exec_external "/usr/bin/touch" $no_opts $target

    foreach attempt { first second } {
        puts "\n Removing the test_file.txt - $attempt attempt"
        turbine::exec_external "/bin/rm" $no_opts $target
    }
}


# Turbine driver sequence: set defaults, initialize, run main, finalize.
turbine::defaults
# NOTE(review): servers is not set anywhere in this file - presumably
# defined by turbine::defaults; confirm
turbine::init $servers
turbine::enable_read_refcount
# Hand the main proc defined in this file to turbine::start
turbine::start main
turbine::finalize


1 change: 1 addition & 0 deletions turbine/code/tests/sync-exec-1.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ source $( dirname $0 )/setup.sh > ${OUTPUT} 2>&1
set -x

bin/turbine -l -n ${PROCS} ${SCRIPT} >> ${OUTPUT} 2>&1

[[ ${?} == 0 ]] || test_result 1

grep -q "Hello World" ${OUTPUT} || test_result 1
Expand Down
Loading