Skip to content

Improve S6 error code/signal handling #1825

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jan 25, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 36 additions & 49 deletions pkg/cortex/serve/init/bootloader.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,13 @@ if [ "$CORTEX_VERSION" != "$EXPECTED_CORTEX_VERSION" ]; then
exit 1
fi

# configure log level for python scripts
/opt/conda/envs/env/bin/python -c "from cortex_internal.lib import util; import os; util.expand_environment_vars_on_file(os.environ['CORTEX_LOG_CONFIG_FILE'])"
# Runs cortex_internal's util.expand_environment_vars_on_file on the given file
# using the conda environment's python interpreter. Callers do not capture any
# output, i.e. they rely on the helper updating the file itself.
# Arguments: $1 - path of the file to run the substitution on
# Returns:   the exit status of the python interpreter
substitute_env_vars() {
    local file_to_run_substitution=$1
    # the path is handed over through argv instead of being pasted into the python
    # source, so a path containing quotes or backslashes cannot break or alter the snippet
    /opt/conda/envs/env/bin/python -c "from cortex_internal.lib import util; import sys; util.expand_environment_vars_on_file(sys.argv[1])" "$file_to_run_substitution"
}

# configure log level for python scripts
substitute_env_vars $CORTEX_LOG_CONFIG_FILE

mkdir -p /mnt/workspace
mkdir -p /mnt/requests
Expand Down Expand Up @@ -111,55 +116,45 @@ else
rm $tempf
fi

# good pages to read about s6-overlay used in create_s6_service and create_s6_task
# https://wiki.gentoo.org/wiki/S6#Process_supervision
# https://skarnet.org/software/s6/s6-svscanctl.html
# http://skarnet.org/software/s6/s6-svc.html
# http://skarnet.org/software/s6/servicedir.html

# good pages to read about execline
# http://www.troubleshooters.com/linux/execline.htm
# https://danyspin97.org/blog/getting-started-with-execline-scripting/

# Registers an s6 service under /etc/services.d/<name> from the run/finish templates.
# only terminate pod if this process exits with non-zero exit code
# Globals:   SERVICE_NAME and COMMAND_TO_RUN are exported for the duration of the call,
#            so that the python process started by substitute_env_vars can see them,
#            and are unset again before returning
# Arguments: $1 - service name (also the directory name under /etc/services.d)
#            $2 - command line the service's run script should execute
create_s6_service() {
    export SERVICE_NAME=$1
    export COMMAND_TO_RUN=$2

    local dest_dir="/etc/services.d/$SERVICE_NAME"
    local dest_script
    mkdir "$dest_dir"

    # run script: copy the template, then substitute the exported variables into the copy
    dest_script="$dest_dir/run"
    cp /src/cortex/serve/init/templates/run "$dest_script"
    substitute_env_vars "$dest_script"
    chmod +x "$dest_script"

    # finish script: same procedure with the finish template
    dest_script="$dest_dir/finish"
    cp /src/cortex/serve/init/templates/finish "$dest_script"
    substitute_env_vars "$dest_script"
    chmod +x "$dest_script"

    unset SERVICE_NAME
    unset COMMAND_TO_RUN
}

# Registers an s6 task under /etc/services.d/<name> with generated run/finish scripts.
# terminate pod if this process exits (zero or non-zero exit code)
# Kept as a complete definition because the BatchAPI/TaskAPI wiring in this file
# still calls it.
# Arguments: $1 - task name (also the directory name under /etc/services.d)
#            $2 - command line written into the run script
create_s6_task() {
    local task_name=$1
    local cmd=$2

    local dest_dir="/etc/services.d/$task_name"
    local dest_script
    mkdir "$dest_dir"

    dest_script="$dest_dir/run"
    printf '%s\n' "#!/usr/bin/with-contenv bash" > "$dest_script"
    # written verbatim; an unquoted echo would word-split and glob-expand the command
    printf '%s\n' "$cmd" >> "$dest_script"
    chmod +x "$dest_script"

    dest_script="$dest_dir/finish"
    printf '%s\n' "#!/usr/bin/execlineb -S0" > "$dest_script"
    # single quotes keep the execline positional reference literal in the generated file
    printf '%s\n' 'ifelse { s6-test ${1} -ne 0 } { foreground { redirfd -w 1 /var/run/s6/env-stage3/S6_STAGE2_EXITED s6-echo -n -- ${1} } s6-svscanctl -t /var/run/s6/services }' >> "$dest_script"
    printf '%s\n' "s6-svscanctl -t /var/run/s6/services" >> "$dest_script"
    chmod +x "$dest_script"
}

# Registers an s6 service whose run script is an existing file; the finish script
# comes from the finish template.
# only terminate pod if this process exits with non-zero exit code
# Globals:   SERVICE_NAME is exported for the duration of the call, so that the python
#            process started by substitute_env_vars can see it, and is unset afterwards
# Arguments: $1 - service name (also the directory name under /etc/services.d)
#            $2 - path of the file to install as the service's run script
create_s6_service_from_file() {
    export SERVICE_NAME=$1
    local runnable=$2

    local dest_dir="/etc/services.d/$SERVICE_NAME"
    local dest_script
    mkdir "$dest_dir"

    # the run script is copied as-is; no variable substitution is applied to it
    cp "$runnable" "$dest_dir/run"
    chmod +x "$dest_dir/run"

    dest_script="$dest_dir/finish"
    cp /src/cortex/serve/init/templates/finish "$dest_script"
    substitute_env_vars "$dest_script"
    chmod +x "$dest_script"

    unset SERVICE_NAME
}

# prepare webserver
Expand All @@ -168,27 +163,19 @@ if [ "$CORTEX_KIND" = "RealtimeAPI" ]; then
# prepare uvicorn workers
mkdir /run/uvicorn
for i in $(seq 1 $CORTEX_PROCESSES_PER_REPLICA); do
create_s6_service "uvicorn-$((i-1))" "cd /mnt/project && $source_env_file_cmd && exec env PYTHONUNBUFFERED=TRUE env PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH /opt/conda/envs/env/bin/python /src/cortex/serve/start/server.py /run/uvicorn/proc-$((i-1)).sock"
create_s6_service "uvicorn-$((i-1))" "cd /mnt/project && $source_env_file_cmd && PYTHONUNBUFFERED=TRUE PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH exec /opt/conda/envs/env/bin/python /src/cortex/serve/start/server.py /run/uvicorn/proc-$((i-1)).sock"
done

create_s6_service "nginx" "exec nginx -c /run/nginx.conf"

# prepare api readiness checker
dest_dir="/etc/services.d/api_readiness"
mkdir $dest_dir
cp /src/cortex/serve/poll/readiness.sh $dest_dir/run
chmod +x $dest_dir/run

# generate nginx conf
/opt/conda/envs/env/bin/python -c 'from cortex_internal.lib import util; import os; generated = util.render_jinja_template("/src/cortex/serve/nginx.conf.j2", os.environ); print(generated);' > /run/nginx.conf

# create the python initialization service
create_s6_service "py_init" "cd /mnt/project && /opt/conda/envs/env/bin/python /src/cortex/serve/init/script.py"
elif [ "$CORTEX_KIND" = "BatchAPI" ]; then
create_s6_task "batch" "cd /mnt/project && $source_env_file_cmd && PYTHONUNBUFFERED=TRUE PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH /opt/conda/envs/env/bin/python /src/cortex/serve/start/batch.py"
create_s6_service "py_init" "cd /mnt/project && exec /opt/conda/envs/env/bin/python /src/cortex/serve/init/script.py"
create_s6_service "nginx" "exec nginx -c /run/nginx.conf"
create_s6_service_from_file "api_readiness" "/src/cortex/serve/poll/readiness.sh"

# create the python initialization service
create_s6_service "py_init" "cd /mnt/project && /opt/conda/envs/env/bin/python /src/cortex/serve/init/script.py"
elif [ "$CORTEX_KIND" = "BatchAPI" ]; then
create_s6_service "py_init" "cd /mnt/project && exec /opt/conda/envs/env/bin/python /src/cortex/serve/init/script.py"
create_s6_service "batch" "cd /mnt/project && $source_env_file_cmd && PYTHONUNBUFFERED=TRUE PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH exec /opt/conda/envs/env/bin/python /src/cortex/serve/start/batch.py"
elif [ "$CORTEX_KIND" = "TaskAPI" ]; then
create_s6_task "task" "cd /mnt/project && $source_env_file_cmd && PYTHONUNBUFFERED=TRUE PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH /opt/conda/envs/env/bin/python /src/cortex/serve/start/task.py"
create_s6_service "task" "cd /mnt/project && $source_env_file_cmd && PYTHONUNBUFFERED=TRUE PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH exec /opt/conda/envs/env/bin/python /src/cortex/serve/start/task.py"
fi
2 changes: 1 addition & 1 deletion pkg/cortex/serve/init/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def main():

# exit if cron has exited with errors
if cron and isinstance(cron.exitcode, int) and cron.exitcode != 0:
# if it was killed by a signal
# if it was killed by a catchable signal
if cron.exitcode < 0:
sys.exit(-cron.exitcode)
sys.exit(cron.exitcode)
Expand Down
62 changes: 62 additions & 0 deletions pkg/cortex/serve/init/templates/finish
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/execlineb -S2

# Copyright 2021 Cortex Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# good pages to read about s6-overlay
# https://wiki.gentoo.org/wiki/S6#Process_supervision
# https://skarnet.org/software/s6/s6-svscanctl.html
# http://skarnet.org/software/s6/s6-svc.html
# http://skarnet.org/software/s6/servicedir.html

# good pages to read about execline
# http://www.troubleshooters.com/linux/execline.htm
# https://danyspin97.org/blog/getting-started-with-execline-scripting/
# https://wiki.tcl-lang.org/page/execline

# name the two positional parameters of this script: 1 is the exit code, 2 is the signal number
define exit_code ${1}
define signal ${2}
# signal number that is deliberately not reported as a failure below
define sigterm 15

# when process receives a non-catchable signal (e.g. KILL/9)
# s6 sets the exit code to >= 256 and expects the user to inspect the signal value instead
# the "if -n" guard leaves this branch without doing anything when the signal equals sigterm;
# for any other signal, 128 + signal is logged, written to S6_STAGE2_EXITED,
# and s6-svscanctl -t is run on /var/run/s6/services
ifelse { s6-test ${exit_code} -gt 255 } {
if -n { s6-test ${signal} -eq ${sigterm} }
backtick -n new_exit_code { s6-expr 128 + ${signal} }
importas -ui new_exit_code new_exit_code
foreground { s6-echo "[finish-manager] service ${SERVICE_NAME} received signal value ${signal}, exiting with ${new_exit_code} exit code" }
foreground { redirfd -w 1 /var/run/s6/env-stage3/S6_STAGE2_EXITED s6-echo -n -- ${new_exit_code} }
s6-svscanctl -t /var/run/s6/services
}

# if we receive an exit code between 1 and 255, then exit accordingly with the given value
# (the value is logged and written unchanged to S6_STAGE2_EXITED before s6-svscanctl -t is run)
ifelse { s6-test ${exit_code} -ne 0 } {
foreground { s6-echo "[finish-manager] service ${SERVICE_NAME} exiting with exit code ${exit_code}" }
foreground { redirfd -w 1 /var/run/s6/env-stage3/S6_STAGE2_EXITED s6-echo -n -- ${exit_code} }
s6-svscanctl -t /var/run/s6/services
}

# otherwise stop the service
# and remove its directory from /etc/services.d so it no longer shows up in the listing below
if { s6-test ${exit_code} -eq 0 }
foreground { s6-echo "[finish-manager] service ${SERVICE_NAME} exiting with exit code 0" }
foreground { s6-svc -O /var/run/s6/services/${SERVICE_NAME} }
foreground { s6-rmrf /etc/services.d/${SERVICE_NAME} }

# stop the supervisor when all services have stopped successfully
# the entries left in /etc/services.d are counted with wc -l; only a count of 0 goes on to s6-svscanctl -h
pipeline { s6-ls /etc/services.d/ }
backtick -n NUM_RUNNING_SERVICES { wc -l }
importas -ui NUM_RUNNING_SERVICES NUM_RUNNING_SERVICES
if { s6-test ${NUM_RUNNING_SERVICES} -eq 0 }
foreground { s6-echo "[finish-manager] all container services have finished; stopping supervisor" }
s6-svscanctl -h /var/run/s6/services
17 changes: 17 additions & 0 deletions pkg/cortex/serve/init/templates/run
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/with-contenv bash

# Copyright 2021 Cortex Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# COMMAND_TO_RUN placeholder: create_s6_service in bootloader.sh exports that variable,
# copies this template to the service's run script and calls substitute_env_vars on the copy
# NOTE(review): presumably the placeholder is replaced by the literal command at that point,
# since the variable is unset again right after the service is created; confirm
${COMMAND_TO_RUN}