-
Notifications
You must be signed in to change notification settings - Fork 25
/
Copy pathalice-strace-retrieve-symbols
executable file
·200 lines (171 loc) · 7.45 KB
/
alice-strace-retrieve-symbols
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#!/usr/bin/env python
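# Build a symbol table for the stack traces recorded by strace++, using gdb
# to look up the function/file/line info of every return address
#
# argv[1] - filename prefix of the strace++ output (presumably the <outfile>
#           given to strace++ with -o): every file matching
#           <argv[1]>*stackinfo* is scanned, and the resulting symbol table
#           is pickled to <argv[1]>.symtab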
#
# (also requires the 'file' program to be installed in addition to 'gdb')
#
# by Philip Guo
import os, sys, re, subprocess, cPickle
from collections import defaultdict, namedtuple
import commands
# Return a symbol table, which is a dict where:
#
# Key: Filename
# Value: Dict where ...
# Key: hex address (string)
# Value: (function name, instruction offset, filename, line number)
# Any of those fields might be null when there isn't adequate debug info
#
# containing all the debug info needed to pretty-print the entries
#
# Input: fn is the filename of the strace++ output trace file
def create_symtab(fn_list):
# each element is a string representing a return address, e.g.,:
# '/lib32/libc-2.11.1.so:0x6990d:0xf769390d'
# it's a colon-separated triple containing:
# 1.) absolute path to the binary
# 2.) our best guess at the offset within that binary
# 3.) the original return address (in case the calculated offset is bogus)
return_addrs_set = set()
for fn in fn_list:
# do a first pass to find ALL return addresses, so that we can call gdb to do a lookup
for line in open(fn):
# look for a raw stack trace of addrs like:
# [ /lib32/libc-2.11.1.so:0x67aef:0xf75ccaef /lib32/libc-2.11.1.so:0x67e06:0xf75cce06 ]
if line.strip() == '[]':
continue
if line[0] == '[':
first_rb = line.find(']')
stack_addrs = line[1:first_rb].strip()
if stack_addrs:
stack_addrs = stack_addrs.split()
for addr in stack_addrs:
return_addrs_set.add(addr)
# Key: filename
# Value: set of (addr_offset, original_addr)
d = defaultdict(set)
for e in return_addrs_set:
try:
filename, addr_offset, original_addr = e.split(':')
except:
print e
assert False
d[filename].add((addr_offset, original_addr))
# Key: filename
# Value: list of addresses to query (strings representing hex numbers)
filenames_to_addrs = defaultdict(list)
for filename, addrs_set in d.iteritems():
# use the following heuristic to determine which address to use:
# - if the file is an 'executable', then use original_addr
# - otherwise if the file is a 'shared object', then use addr_offset
#
# shared objects are usually mmapped into "high" addresses and thus need an
# addr_offset, while exectuables usually do NOT need an offset and can instead
# use their original_addr to do symbol lookups
(file_out, _) = subprocess.Popen(['file', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if 'shared object' in file_out:
for (addr_offset, _) in addrs_set:
filenames_to_addrs[filename].append(addr_offset)
elif 'executable' in file_out:
for (_, original_addr) in addrs_set:
filenames_to_addrs[filename].append(original_addr)
else:
print >> sys.stderr, "Warning:", filename, "doesn't appear to be an executable or shared library"
return get_symbol_table_using_gdb(filenames_to_addrs)
# One resolved address. Individual fields may be None when the binary does not
# carry adequate debug info.
SymbolTableEntry = namedtuple(
    'SymbolTableEntry',
    'func_name instr_offset src_filename src_line_num',
)
def _split_func_and_offset(symbol):
    """Split gdb's 'funcname+offset' text (e.g. 'main+21') into (funcname, offset).

    Only a trailing '+<digits>' is treated as the offset, because some function
    names contain '+' themselves, e.g. "STRING::operator+=(char const*)+91".
    A symbol without a numeric offset yields offset 0.
    """
    name, plus, tail = symbol.rpartition('+')
    if plus and tail.isdigit():
        return name, int(tail)
    return symbol, 0


def get_symbol_table_using_gdb(filenames_to_addrs):
    """Use gdb to probe the debug info of binaries and return a symbol table.

    Input: filenames_to_addrs is a dict mapping each binary filename to a list
    of addresses (strings representing hex numbers) on which to query for
    debug info.

    Returns a dict where:

      Key: Filename
      Value: Dict where ...
        Key: hex address (string)
        Value: the fields of a SymbolTableEntry as a plain tuple
               (func_name, instr_offset, src_filename, src_line_num);
               the last two are None when gdb has no line info.
               Addresses for which gdb reports nothing usable are omitted.

    The advantage of using gdb is that you can usually get file/line info, and
    gdb supports "splitdebug" binaries where the debug info is stored in a
    separate binary linked with .gnu_debuglink
    (See: http://sourceware.org/gdb/onlinedocs/gdb/Separate-Debug-Files.html)

    The disadvantage is that gdb is quite heavyweight. Also, when you can't get
    .gnu_debuglink to work with custom paths (e.g., on Chrome OS), then gdb
    won't print out the proper debug info. TODO: try to look into improving this!
    """
    # local import: the file's top-level import block does not provide tempfile
    import tempfile

    ret = defaultdict(dict)

    line_re = re.compile(
        r'Line (\d+) of "(.*)" starts at address 0x\S+ <(.*?)> and ends at 0x\S+')
    # even if there's no line number info, it might give you the function name
    # e.g., in "No line number information available for address 0x857 <_dl_start_user>"
    # at least you can find out that the function name is _dl_start_user
    no_line_info_re = re.compile(
        r'No line number information available for address 0x\S+ <(.*?)>')

    for filename, addrs_lst in filenames_to_addrs.items():
        # create a gdb script with some filler and the critical line that makes
        # the query for debug info: 'info line *<addr>'
        #
        # A private temporary file (instead of a fixed 'temp.gdb' in the current
        # directory) keeps concurrent runs from clobbering each other; the .gdb
        # suffix is kept so gdb treats the file as a CLI command script.
        script = tempfile.NamedTemporaryFile(mode='w', suffix='.gdb', delete=False)
        try:
            for addr in sorted(addrs_lst):
                script.write('echo ===\\n\n')
                script.write('echo ' + addr + '\\n\n')
                script.write('info line *' + addr + '\n')
            script.close()  # force write to disk, or else gdb sees an empty script

            # now run:
            #   gdb <filename> -batch -x <script>
            # and harvest its stdout
            # ( -batch mode allows gdb to produce 'clean' output and be run as a subprocess
            #   see: http://ftp.gnu.org/old-gnu/Manuals/gdb-5.1.1/html_node/gdb_8.html )
            gdb_stdout, gdb_stderr = subprocess.Popen(
                ['gdb', filename, '-batch', '-x', script.name],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,  # text output on both Python 2 and 3
            ).communicate()
        finally:
            script.close()
            os.remove(script.name)

        if gdb_stderr:
            sys.stderr.write(
                "GDB warnings while processing %s: %s" % (filename, gdb_stderr))

        # parse the output of gdb, where each record is:
        #   ===
        #   <hex address>
        #   one or more lines containing the output of gdb
        #
        # Whatever precedes the first '===' marker is not a record, so drop it.
        for record in gdb_stdout.split('===')[1:]:
            # collapse all space-like characters into a single space to simplify parsing
            record = re.sub(r'\s+', ' ', record).strip()
            if not record:
                continue

            # first word is the echoed address; gdb's answer is the REST
            hex_addr, _, gdb_out = record.partition(' ')
            assert hex_addr.startswith('0x'), hex_addr

            m = line_re.match(gdb_out)
            if m:
                linenum, src_filename, symbol = m.groups()
                funcname, offset = _split_func_and_offset(symbol)
                ret[filename][hex_addr] = tuple(
                    SymbolTableEntry(funcname, offset, src_filename, int(linenum)))
                continue

            m = no_line_info_re.match(gdb_out)
            if m:
                funcname, offset = _split_func_and_offset(m.group(1))
                ret[filename][hex_addr] = tuple(
                    SymbolTableEntry(funcname, offset, None, None))

    return ret
# Script body: collect every <prefix>*stackinfo* file, build the symbol table
# for the return addresses they contain, and pickle it to <prefix>.symtab
import glob  # used only by the script body below

if len(sys.argv) < 2:
    sys.exit('Usage: %s <strace-file-prefix>' % (sys.argv[0],))

strace_file_prefix = sys.argv[1]
output_file = strace_file_prefix + '.symtab'

# Expand the pattern in Python rather than parsing the output of `ls` run
# through a shell: that broke on paths containing whitespace or shell
# metacharacters, and turned the "no such file" message into bogus filenames.
stack_files = sorted(glob.glob(strace_file_prefix + '*stackinfo*'))
if not stack_files:
    sys.stderr.write(
        'Warning: no files match %s*stackinfo*\n' % (strace_file_prefix,))

symtab = create_symtab(stack_files)
with open(output_file, 'wb') as symtab_file:
    cPickle.dump(symtab, symtab_file)