Skip to content

Commit 6467263

Browse files
authored
Improve S6 error code/signal handling (#1825)
1 parent 09b9945 commit 6467263

File tree

4 files changed

+116
-50
lines changed

4 files changed

+116
-50
lines changed

pkg/cortex/serve/init/bootloader.sh

Lines changed: 36 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,13 @@ if [ "$CORTEX_VERSION" != "$EXPECTED_CORTEX_VERSION" ]; then
2424
exit 1
2525
fi
2626

27-
# configure log level for python scripts
28-
/opt/conda/envs/env/bin/python -c "from cortex_internal.lib import util; import os; util.expand_environment_vars_on_file(os.environ['CORTEX_LOG_CONFIG_FILE'])"
27+
function substitute_env_vars() {
28+
file_to_run_substitution=$1
29+
/opt/conda/envs/env/bin/python -c "from cortex_internal.lib import util; import os; util.expand_environment_vars_on_file('$file_to_run_substitution')"
30+
}
31+
32+
# configure log level for python scripts
33+
substitute_env_vars $CORTEX_LOG_CONFIG_FILE
2934

3035
mkdir -p /mnt/workspace
3136
mkdir -p /mnt/requests
@@ -111,55 +116,45 @@ else
111116
rm $tempf
112117
fi
113118

114-
# good pages to read about s6-overlay used in create_s6_service and create_s6_task
115-
# https://wiki.gentoo.org/wiki/S6#Process_supervision
116-
# https://skarnet.org/software/s6/s6-svscanctl.html
117-
# http://skarnet.org/software/s6/s6-svc.html
118-
# http://skarnet.org/software/s6/servicedir.html
119-
120-
# good pages to read about execline
121-
# http://www.troubleshooters.com/linux/execline.htm
122-
# https://danyspin97.org/blog/getting-started-with-execline-scripting/
123-
124119
# only terminate pod if this process exits with non-zero exit code
125120
create_s6_service() {
126-
service_name=$1
127-
cmd=$2
121+
export SERVICE_NAME=$1
122+
export COMMAND_TO_RUN=$2
128123

129-
dest_dir="/etc/services.d/$service_name"
124+
dest_dir="/etc/services.d/$SERVICE_NAME"
130125
mkdir $dest_dir
131126

132127
dest_script="$dest_dir/run"
133-
echo "#!/usr/bin/with-contenv bash" > $dest_script
134-
echo $cmd >> $dest_script
128+
cp /src/cortex/serve/init/templates/run $dest_script
129+
substitute_env_vars $dest_script
135130
chmod +x $dest_script
136131

137132
dest_script="$dest_dir/finish"
138-
echo "#!/usr/bin/execlineb -S0" > $dest_script
139-
echo "ifelse { s6-test \${1} -ne 0 } { foreground { redirfd -w 1 /var/run/s6/env-stage3/S6_STAGE2_EXITED s6-echo -n -- \${1} } s6-svscanctl -t /var/run/s6/services }" >> $dest_script
140-
echo "s6-svc -O /var/run/s6/services/$service_name" >> $dest_script
133+
cp /src/cortex/serve/init/templates/finish $dest_script
134+
substitute_env_vars $dest_script
141135
chmod +x $dest_script
136+
137+
unset SERVICE_NAME
138+
unset COMMAND_TO_RUN
142139
}
143140

144-
# terminate pod if this process exits (zero or non-zero exit code)
145-
create_s6_task() {
146-
task_name=$1
147-
cmd=$2
141+
# only terminate pod if this process exits with non-zero exit code
142+
create_s6_service_from_file() {
143+
export SERVICE_NAME=$1
144+
runnable=$2
148145

149-
dest_dir="/etc/services.d/$task_name"
146+
dest_dir="/etc/services.d/$SERVICE_NAME"
150147
mkdir $dest_dir
151148

152-
dest_script="$dest_dir/run"
153-
echo "#!/usr/bin/with-contenv bash" > $dest_script
154-
echo $cmd >> $dest_script
155-
chmod +x $dest_script
149+
cp $runnable $dest_dir/run
150+
chmod +x $dest_dir/run
156151

157152
dest_script="$dest_dir/finish"
158-
echo "#!/usr/bin/execlineb -S0" > $dest_script
159-
echo "ifelse { s6-test \${1} -ne 0 } { foreground { redirfd -w 1 /var/run/s6/env-stage3/S6_STAGE2_EXITED s6-echo -n -- \${1} } s6-svscanctl -t /var/run/s6/services }" >> $dest_script
160-
echo "s6-svscanctl -t /var/run/s6/services" >> $dest_script
161-
153+
cp /src/cortex/serve/init/templates/finish $dest_script
154+
substitute_env_vars $dest_script
162155
chmod +x $dest_script
156+
157+
unset SERVICE_NAME
163158
}
164159

165160
# prepare webserver
@@ -168,27 +163,19 @@ if [ "$CORTEX_KIND" = "RealtimeAPI" ]; then
168163
# prepare uvicorn workers
169164
mkdir /run/uvicorn
170165
for i in $(seq 1 $CORTEX_PROCESSES_PER_REPLICA); do
171-
create_s6_service "uvicorn-$((i-1))" "cd /mnt/project && $source_env_file_cmd && exec env PYTHONUNBUFFERED=TRUE env PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH /opt/conda/envs/env/bin/python /src/cortex/serve/start/server.py /run/uvicorn/proc-$((i-1)).sock"
166+
create_s6_service "uvicorn-$((i-1))" "cd /mnt/project && $source_env_file_cmd && PYTHONUNBUFFERED=TRUE PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH exec /opt/conda/envs/env/bin/python /src/cortex/serve/start/server.py /run/uvicorn/proc-$((i-1)).sock"
172167
done
173168

174-
create_s6_service "nginx" "exec nginx -c /run/nginx.conf"
175-
176-
# prepare api readiness checker
177-
dest_dir="/etc/services.d/api_readiness"
178-
mkdir $dest_dir
179-
cp /src/cortex/serve/poll/readiness.sh $dest_dir/run
180-
chmod +x $dest_dir/run
181-
182169
# generate nginx conf
183170
/opt/conda/envs/env/bin/python -c 'from cortex_internal.lib import util; import os; generated = util.render_jinja_template("/src/cortex/serve/nginx.conf.j2", os.environ); print(generated);' > /run/nginx.conf
184171

185-
# create the python initialization service
186-
create_s6_service "py_init" "cd /mnt/project && /opt/conda/envs/env/bin/python /src/cortex/serve/init/script.py"
187-
elif [ "$CORTEX_KIND" = "BatchAPI" ]; then
188-
create_s6_task "batch" "cd /mnt/project && $source_env_file_cmd && PYTHONUNBUFFERED=TRUE PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH /opt/conda/envs/env/bin/python /src/cortex/serve/start/batch.py"
172+
create_s6_service "py_init" "cd /mnt/project && exec /opt/conda/envs/env/bin/python /src/cortex/serve/init/script.py"
173+
create_s6_service "nginx" "exec nginx -c /run/nginx.conf"
174+
create_s6_service_from_file "api_readiness" "/src/cortex/serve/poll/readiness.sh"
189175

190-
# create the python initialization service
191-
create_s6_service "py_init" "cd /mnt/project && /opt/conda/envs/env/bin/python /src/cortex/serve/init/script.py"
176+
elif [ "$CORTEX_KIND" = "BatchAPI" ]; then
177+
create_s6_service "py_init" "cd /mnt/project && exec /opt/conda/envs/env/bin/python /src/cortex/serve/init/script.py"
178+
create_s6_service "batch" "cd /mnt/project && $source_env_file_cmd && PYTHONUNBUFFERED=TRUE PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH exec /opt/conda/envs/env/bin/python /src/cortex/serve/start/batch.py"
192179
elif [ "$CORTEX_KIND" = "TaskAPI" ]; then
193-
create_s6_task "task" "cd /mnt/project && $source_env_file_cmd && PYTHONUNBUFFERED=TRUE PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH /opt/conda/envs/env/bin/python /src/cortex/serve/start/task.py"
180+
create_s6_service "task" "cd /mnt/project && $source_env_file_cmd && PYTHONUNBUFFERED=TRUE PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH exec /opt/conda/envs/env/bin/python /src/cortex/serve/start/task.py"
194181
fi

pkg/cortex/serve/init/script.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ def main():
185185

186186
# exit if cron has exited with errors
187187
if cron and isinstance(cron.exitcode, int) and cron.exitcode != 0:
188-
# if it was killed by a signal
188+
# if it was killed by a catchable signal
189189
if cron.exitcode < 0:
190190
sys.exit(-cron.exitcode)
191191
sys.exit(cron.exitcode)
pkg/cortex/serve/init/templates/finish

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#!/usr/bin/execlineb -S2
2+
3+
# Copyright 2021 Cortex Labs, Inc.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
# good pages to read about s6-overlay
18+
# https://wiki.gentoo.org/wiki/S6#Process_supervision
19+
# https://skarnet.org/software/s6/s6-svscanctl.html
20+
# http://skarnet.org/software/s6/s6-svc.html
21+
# http://skarnet.org/software/s6/servicedir.html
22+
23+
# good pages to read about execline
24+
# http://www.troubleshooters.com/linux/execline.htm
25+
# https://danyspin97.org/blog/getting-started-with-execline-scripting/
26+
# https://wiki.tcl-lang.org/page/execline
27+
28+
define exit_code ${1}
29+
define signal ${2}
30+
define sigterm 15
31+
32+
# when the process receives a non-catchable signal (e.g. KILL/9)
33+
# s6 sets the exit code to >= 256 and expects the user to inspect the signal value instead
34+
ifelse { s6-test ${exit_code} -gt 255 } {
35+
if -n { s6-test ${signal} -eq ${sigterm} }
36+
backtick -n new_exit_code { s6-expr 128 + ${signal} }
37+
importas -ui new_exit_code new_exit_code
38+
foreground { s6-echo "[finish-manager] service ${SERVICE_NAME} received signal value ${signal}, exiting with ${new_exit_code} exit code" }
39+
foreground { redirfd -w 1 /var/run/s6/env-stage3/S6_STAGE2_EXITED s6-echo -n -- ${new_exit_code} }
40+
s6-svscanctl -t /var/run/s6/services
41+
}
42+
43+
# if we receive an exit code between 0 and 255, then exit accordingly with the given value
44+
ifelse { s6-test ${exit_code} -ne 0 } {
45+
foreground { s6-echo "[finish-manager] service ${SERVICE_NAME} exiting with exit code ${exit_code}" }
46+
foreground { redirfd -w 1 /var/run/s6/env-stage3/S6_STAGE2_EXITED s6-echo -n -- ${exit_code} }
47+
s6-svscanctl -t /var/run/s6/services
48+
}
49+
50+
# otherwise stop the service
51+
if { s6-test ${exit_code} -eq 0 }
52+
foreground { s6-echo "[finish-manager] service ${SERVICE_NAME} exiting with exit code 0" }
53+
foreground { s6-svc -O /var/run/s6/services/${SERVICE_NAME} }
54+
foreground { s6-rmrf /etc/services.d/${SERVICE_NAME} }
55+
56+
# stop the supervisor when all services have stopped successfully
57+
pipeline { s6-ls /etc/services.d/ }
58+
backtick -n NUM_RUNNING_SERVICES { wc -l }
59+
importas -ui NUM_RUNNING_SERVICES NUM_RUNNING_SERVICES
60+
if { s6-test ${NUM_RUNNING_SERVICES} -eq 0 }
61+
foreground { s6-echo "[finish-manager] all container services have finished; stopping supervisor" }
62+
s6-svscanctl -h /var/run/s6/services

pkg/cortex/serve/init/templates/run

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/usr/bin/with-contenv bash
2+
3+
# Copyright 2021 Cortex Labs, Inc.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
${COMMAND_TO_RUN}

0 commit comments

Comments
 (0)