forked from yugabyte/yugabyte-db
-
Notifications
You must be signed in to change notification settings - Fork 0
/
k8s_parent.py
executable file
·201 lines (169 loc) · 6.38 KB
/
k8s_parent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
#!/usr/bin/env python
import errno
import logging
import os
import shutil
import signal
import subprocess
import sys
from glob import glob
# Handle to the wrapped DB process; assigned in __main__ once the child
# has been spawned, and read by signal_handler().
child_process = None

# Signals that this wrapper forwards to its child process. The aim is
# to cover at least everything the container runtime may deliver to us.
#
# Deliberately absent from this set:
#
#   SIGCHLD -- child adoption/reaping is delegated to Tini, under which
#              this script is expected to run.
#
#   SIGKILL / SIGSTOP -- cannot be caught, blocked, or handled at all.
#
#   SIGTTIN, SIGTTOU, SIGFPE, SIGILL, SIGSEGV, SIGBUS, SIGABRT,
#   SIGTRAP, SIGSYS -- Tini itself does not propagate these to the
#   child process:
#   https://github.com/krallin/tini/blob/378bbbc8909/src/tini.c#L465
PROPAGATE_SIGS = {
    signal.SIGHUP,
    signal.SIGINT,
    signal.SIGQUIT,
    signal.SIGTERM,
    signal.SIGUSR1,
    signal.SIGUSR2,
}

# Upper bound on how many core files copy_cores() will copy.
MAX_FILES_FROM_GLOB = 5
# Glob used to locate core files inside the core dump directory.
CORE_GLOB = "*core*"
def signal_handler(signum, frame):
    """Forward signal *signum* to the child process, if one exists.

    Installed for every signal in PROPAGATE_SIGS. ``frame`` is the
    stack frame required by the Python signal-handler signature and is
    unused.
    """
    # Bug fix: the original logged "sending ... to the child process"
    # unconditionally, even when there was no child and nothing was
    # sent. Log accurately instead.
    if child_process is None:
        logging.info("no child process to forward {} to, ignoring".format(signum))
    else:
        logging.info("sending {} to the child process".format(signum))
        child_process.send_signal(signum)
def get_core_dump_dir():
    """Return the absolute path of the directory named in core_pattern.

    Reads /proc/sys/kernel/core_pattern. Raises ValueError when the
    pattern pipes cores to a helper program (i.e. starts with "|"),
    because there is no directory to return in that case.
    """
    with open("/proc/sys/kernel/core_pattern") as pattern_file:
        core_pattern = pattern_file.readline()
    logging.info("core_pattern is: {}".format(core_pattern.rstrip()))
    if core_pattern.startswith("|"):
        raise ValueError("core_pattern starts with |, can't do anything useful")
    # abspath resolves a relative core_pattern against our CWD; this
    # script and the child process share the same CWD, so the result
    # points at the correct directory.
    return os.path.abspath(os.path.dirname(core_pattern))
def create_core_dump_dir():
    """Best-effort creation of the directory named in core_pattern.

    Every failure except "directory already exists" is logged as a
    warning rather than raised: missing core dumps should not prevent
    the DB process from starting.
    """
    try:
        os.makedirs(get_core_dump_dir())
    except Exception as error:
        # TODO: use os.makedirs(core_dump_dir, exist_ok=True) when we
        # move to newer enterprise Linux with Python 3.
        already_exists = isinstance(error, OSError) and error.errno == errno.EEXIST
        if not already_exists:
            logging.warning(
                "Core dumps might not get collected: "
                + "failure while creating the core dump directory: "
                + "{}: {}".format(type(error).__name__, error)
            )
def copy_cores(dst):
    """Copy the newest core files from the core dump directory to *dst*.

    *dst* is created if it does not exist. At most MAX_FILES_FROM_GLOB
    files are copied, newest first (by mtime). Returns the number of
    files copied; 0 when the core dump directory and *dst* are the
    same directory.
    """
    try:
        os.makedirs(dst)
    except OSError as error:
        # Don't raise if the destination directory already exists.
        if error.errno != errno.EEXIST:
            # Bare raise preserves the original traceback.
            raise
    total_files_copied = 0
    dir_path = get_core_dump_dir()
    if os.path.samefile(dir_path, dst):
        logging.info(
            "Skipping copy of core files: '{}' and '{}' are the same directories".format(
                dir_path, dst
            )
        )
        return total_files_copied
    # TODO: parse the core_pattern to generate glob pattern instead of
    # using simple glob
    basename_glob = CORE_GLOB
    core_glob = os.path.join(dir_path, basename_glob)
    logging.info(
        "Copying latest {} core files to '{}' using glob '{}'".format(
            MAX_FILES_FROM_GLOB, dst, core_glob
        )
    )
    core_files = glob(core_glob)
    # TODO: handle cases where this list is huge; less likely with the
    # current glob *core*, but possible with a generated one.
    # Sort the files, latest first, so the most recent dumps survive
    # the MAX_FILES_FROM_GLOB cut-off.
    core_files.sort(key=os.path.getmtime, reverse=True)
    for core_file in core_files:
        if total_files_copied == MAX_FILES_FROM_GLOB:
            logging.info("Reached max allowed core files, skipping the rest")
            break
        if not os.path.isfile(core_file):
            logging.info("'{}' is not a regular file, skipping".format(core_file))
            continue
        logging.info("Copying core file '{}'".format(core_file))
        shutil.copy(core_file, dst)
        total_files_copied += 1
    return total_files_copied
def invoke_hook(hook_stage=None):
    """Invoke the kubernetes configmap debug hook for *hook_stage*.

    hook_stage must be "pre" or "post"; raises ValueError otherwise.
    Looks for a script named
    "<last-3-hostname-components>-<stage>_debug_hook.sh" under
    /opt/debug_hooks_config/; when no such file exists this is a
    silent no-op.
    """
    if hook_stage not in ("pre", "post"):
        # ValueError (a subclass of Exception, so existing catchers
        # still work) instead of a generic Exception("NotImplemented").
        raise ValueError("hook_stage must be 'pre' or 'post', got: {}".format(hook_stage))
    # Default to "" so a missing HOSTNAME yields a hook name that just
    # won't match any file, instead of an AttributeError on None.
    hostname = os.getenv("HOSTNAME", "")
    hostname_parsed = "-".join(hostname.split("-")[-3:])
    hook_filename = "{}-{}_debug_hook.sh".format(hostname_parsed, hook_stage)
    op_name = "{}_{}_{}".format(hostname, hook_stage, "debug_hook")
    hook_filepath = os.path.join("/opt/debug_hooks_config/", hook_filename)
    if os.path.exists(hook_filepath):
        logging.info("Executing operation: {} filepath: {}".format(op_name, hook_filepath))
        # NOTE(review): shell=True runs the hook through the shell; the
        # path comes from operator-controlled configmap content, not
        # untrusted input, so this is acceptable here.
        # TODO: Do we care about capturing ret code,
        # exception since exceptions will be logged by default
        output = subprocess.check_output(hook_filepath, shell=True)
        logging.info("Output from hook {}".format(output))
if __name__ == "__main__":
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(filename)s: %(message)s",
        level=logging.INFO,
    )
    command = sys.argv[1:]
    if not command:
        logging.critical("No command to run")
        sys.exit(1)
    cores_dir = os.getenv("YBDEVOPS_CORECOPY_DIR")
    if cores_dir is None:
        logging.critical("YBDEVOPS_CORECOPY_DIR environment variable must be set")
        sys.exit(1)
    logging.info("Core files will be copied to '{}'".format(cores_dir))

    # Make sure the directory from core_pattern is present, otherwise
    # core dumps are not collected.
    create_core_dump_dir()

    # Invoke the pre-start debug hook.
    invoke_hook(hook_stage="pre")

    # Bug fix: install the signal handlers BEFORE spawning the child.
    # Previously a signal delivered between Popen() and handler
    # installation hit this wrapper's default disposition, killing the
    # wrapper and orphaning the DB child. Until child_process is
    # assigned below, the handler simply drops the signal.
    for sig in PROPAGATE_SIGS:
        signal.signal(sig, signal_handler)

    child_process = subprocess.Popen(command)
    # TODO/RFC: how to handle the failures which happen after
    # this point? Need some way to terminate the DB process?
    child_process.wait()

    # Invoke the post-exit debug hook.
    invoke_hook(hook_stage="post")

    # Do the core copy, and exit with the child's return code.
    files_copied = copy_cores(cores_dir)
    logging.info("Copied {} core files to '{}'".format(files_copied, cores_dir))
    sys.exit(child_process.returncode)