#===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
# The LLVM Compiler Infrastructure
# This file is distributed under the University of Illinois Open Source
# License. See https://github.com/llvm-mirror/compiler-rt/blob/master/LICENSE.TXT
#===------------------------------------------------------------------------===#
# Module-level configuration. The command-line handling at the bottom of the
# file overwrites these; the functions and classes below read them as globals.

# Optional flag enabling extra diagnostic output.
DEBUG = False
# Whether function names should be demangled by the external tools.
demangle = False
# Prefix prepended to binutils tool names (e.g. a cross-toolchain prefix).
binutils_prefix = None
# Path prepended to binary names by sysroot_path_filter().
sysroot_path = None
# Callable mapping a binary name from the log to the file to symbolize.
binary_name_filter = None
# Regex fragments whose match (and everything before it) is cut from paths.
fix_filename_patterns = None
# Stream the symbolization loop reads log lines from.
logfile = sys.stdin
# User-supplied directories to search for separate debug-info files.
separate_debug_dir_list = []
# Load-address difference of the current prelinked binary, if any.
prelink_offset = None
# Cache of per-binary symbolizer chains, keyed by binary name.
symbolizers = {}
# FIXME: merge the code that calls fix_filename().
def fix_filename(file_name):
    """Normalize a ``file:line`` location reported by a symbolizer.

    Args:
        file_name: location string as printed by the symbolizer.

    Returns:
        The location with every configured ``fix_filename_patterns`` prefix
        removed, ASan runtime sources collapsed to ``_asan_rtl_`` and
        ``crtstuff.c:0`` replaced by ``???:0``.
    """
    if fix_filename_patterns:
        for path_to_cut in fix_filename_patterns:
            # Drop everything up to and including the user-supplied pattern.
            file_name = re.sub('.*' + path_to_cut, '', file_name)
    file_name = re.sub('.*asan_[a-z_]*.cc:[0-9]*', '_asan_rtl_', file_name)
    file_name = re.sub('.*crtstuff.c:0', '???:0', file_name)
    # Callers use the result, so the cleaned name must be returned.
    return file_name
def sysroot_path_filter(binary_name):
    """Return ``binary_name`` prefixed with the module-level ``sysroot_path``.

    Plain string concatenation is used (not ``os.path.join``) because
    ``binary_name`` is normally an absolute path taken from the log.
    """
    return sysroot_path + binary_name
def use_binutils_prefix(tool_name):
    """Return ``tool_name`` with the configured binutils prefix applied.

    Args:
        tool_name: bare tool name such as ``'readelf'`` or ``'addr2line'``.

    Returns:
        ``binutils_prefix + tool_name`` when a prefix is configured, otherwise
        ``tool_name`` unchanged.
    """
    # binutils_prefix defaults to None, so only concatenate when it is set.
    if binutils_prefix:
        tool_name = binutils_prefix + tool_name
    return tool_name
def print_error_message(message):
    """Write a symbolization error ``message`` to standard error.

    ``sys.stderr.write`` is used instead of the ``print >>`` statement so the
    function behaves the same under Python 2 and Python 3.
    """
    sys.stderr.write('Error occured during symbolizisation: ' + message + '\n')
class DebugInfoHandler(object):
    """Find the file that carries debug info for a (possibly stripped) binary.

    The lookup follows the GDB rules for separate debug files: the "build-id"
    method is tried first, then the "debug link" method. ELF data is obtained
    by running ``readelf`` (with the configured binutils prefix).
    """

    def __init__(self, binary_name_filter=None):
        """Store the name filter and choose the global debug directories.

        Args:
            binary_name_filter: optional callable applied to every path before
                it is opened (e.g. to prepend a sysroot).
        """
        self.binary_name_filter = binary_name_filter
        self.global_debug_dir_list = [self.use_name_filter("/usr/lib/debug")]
        # Directories given on the command line replace the default one.
        if separate_debug_dir_list:
            self.global_debug_dir_list = separate_debug_dir_list

    def use_name_filter(self, binary_name):
        """Return ``binary_name`` passed through the name filter, if any."""
        if self.binary_name_filter:
            binary_name = self.binary_name_filter(binary_name)
        return binary_name

    def calc_crc32(self, filename):
        """Return the CRC32 of the file's contents as a hex string."""
        with open(filename, 'rb') as debug_file:
            data = debug_file.read()
        # Mask to 32 bits: crc32() may return a negative value on Python 2.
        return '%08x' % (binascii.crc32(data, 0) & 0xFFFFFFFF)

    def readelf_binary(self, options, binary):
        """Run ``readelf <options> -W <binary>`` and return its text output.

        Returns:
            The captured standard output, or None when readelf could not be
            started or exited with a non-zero status.
        """
        cmd = [use_binutils_prefix('readelf'), options, '-W', binary]
        try:
            with open(os.devnull, 'w') as devnull:
                # universal_newlines gives text output on Python 2 and 3.
                process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                           stderr=devnull,
                                           universal_newlines=True)
                (readelf_out, _) = process.communicate()
        except OSError:
            print_error_message('the following command failed:\n"' +
                                ' '.join(cmd) + '"')
            return None
        if process.returncode == 0:
            return readelf_out
        return None

    def has_debuginfo(self, binary):
        """Return True when the section list mentions a ``.debug_`` section."""
        readelf_out = self.readelf_binary('-S', binary)
        return bool(readelf_out) and (".debug_" in readelf_out)

    def get_buildid(self, binary):
        """Return the Build ID of ``binary`` or None.

        Build ID is 40-length hex value following after "Build ID:", e.g.:

            Notes at offset 0x00000274 with length 0x00000024:
              Owner  Data size   Description
              GNU    0x00000014  NT_GNU_BUILD_ID (unique build ID bitstring)
                Build ID: 977b1d1375ba6791d5f6dd38d8d55b95f9fca33a
        """
        readelf_out = self.readelf_binary('-nw', binary)
        if not readelf_out:
            return None
        for line in readelf_out.split("\n"):
            if 'Build ID:' not in line:
                continue
            match = re.search('[a-f0-9]{40}', line)
            if match:
                return match.group(0)
            print_error_message('failed to read Build ID value from ' + binary)
        return None

    def get_debuglink_name(self, binary):
        """Return the file name stored in ``.gnu_debuglink`` or None.

        "debug link" is the last word in the first line of dump, e.g.:

            String dump of section '.gnu_debuglink':
              [     0]  HeapOutOfBounds.out.debug
        """
        readelf_out = self.readelf_binary('--string-dump=.gnu_debuglink', binary)
        if not readelf_out:
            return None
        # A real list is required: filter() is lazy on Python 3.
        readelf_lines = [line for line in readelf_out.split("\n") if line]
        headline = "String dump of section '.gnu_debuglink':"
        try:
            debuglink_line_idx = readelf_lines.index(headline) + 1
        except ValueError:
            # There is no gnu_debuglink section in this binary
            return None
        if len(readelf_lines) > debuglink_line_idx:
            words = readelf_lines[debuglink_line_idx].split()
            if words:
                return words[-1]
        print_error_message('failed to read debuglink value from ' + binary)
        return None

    def get_debuglink_crc(self, binary):
        """Return the CRC stored in ``.gnu_debuglink`` as a hex string or None.

        crc is the last hex group (before string representation) in the last
        line of dump, e.g. (crc is f89f21c2):

            Hex dump of section '.gnu_debuglink':
              0x00000000 48656170 4f75744f 66426f75 6e64732e HeapOutOfBounds.
              0x00000010 6f75742e 64656275 67000000 f89f21c2 out.debug.....!.
        """
        readelf_out = self.readelf_binary('--hex-dump=.gnu_debuglink', binary)
        if not readelf_out:
            return None
        # get last non-empty line
        non_empty = [line for line in readelf_out.split("\n") if line]
        # remove last 16 characters (string dump) from line
        words = non_empty[-1][:-16].split() if non_empty else []
        # crc is last word in string
        crc = words[-1] if words else ''
        if re.match('[a-f0-9]{8}$', crc):
            if sys.byteorder == 'little':
                # The dump shows bytes in file order; reverse them to get the
                # numeric value. NOTE(review): the host byte order is used as a
                # stand-in for the target's - confirm for cross setups.
                crc = ''.join(reversed([crc[i:i + 2] for i in range(0, 8, 2)]))
            return crc
        print_error_message('failed to get crc checksum from debuglink in ' + binary)
        return None

    def is_prelinked(self, binary):
        """Return True when ``binary`` has a ``.gnu.prelink_undo`` section."""
        readelf_out = self.readelf_binary('-S', binary)
        return bool(readelf_out) and ".gnu.prelink_undo" in readelf_out

    def get_load_address(self, binary):
        """Return the VirtAddr of the LOAD segment at file offset 0, or None.

        Readelf program headers output example:

            Elf file type is DYN (Shared object file)
            Entry point 0xb1160668
            There are 10 program headers, starting at offset 52

              Type   Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
              EXIDX  0x124754 0xb126c754 0xb126c754 0x04498 0x04498 R   0x4
              LOAD   0x000000 0xb1148000 0xb1148000 0x12be9c 0x12be9c R E 0x8000
        """
        readelf_out = self.readelf_binary('-l', binary)
        if readelf_out:
            load_line = re.compile(r'[\s]*LOAD[\s]+0x[0]+[\s]+0x[a-fA-F\d]+')
            for line in readelf_out.split("\n"):
                if load_line.match(line):
                    # Columns: Type, Offset, VirtAddr, ...
                    return line.split()[2]
        print_error_message('failed to get load address in ' + binary)
        return None

    def get_prelink_offset(self, orig, prelinked):
        """Return the load-address difference ``prelinked - orig``.

        Returns None when ``prelinked`` is not prelinked or when either load
        address could not be determined.
        """
        if not self.is_prelinked(prelinked):
            return None
        orig_load_addr = self.get_load_address(orig)
        prelinked_load_addr = self.get_load_address(prelinked)
        if orig_load_addr is None or prelinked_load_addr is None:
            return None
        return int(prelinked_load_addr, 16) - int(orig_load_addr, 16)

    def locate_in_orig_dir(self, debuglink_name, orig_binary_name):
        """Candidate: the debug file next to the original binary."""
        debuginfo = os.path.join(os.path.dirname(orig_binary_name), debuglink_name)
        return [self.use_name_filter(debuginfo)]

    def locate_in_debug_subdir(self, debuglink_name, orig_binary_name):
        """Candidate: the debug file in a ``.debug`` subdirectory."""
        debuginfo = os.path.join(os.path.dirname(orig_binary_name), '.debug',
                                 debuglink_name)
        return [self.use_name_filter(debuginfo)]

    def locate_in_global_debug_dir(self, debuglink_name, orig_binary_name):
        """Candidates: the binary's directory mirrored under each debug dir."""
        relative = os.path.join(os.path.dirname(orig_binary_name), debuglink_name)
        # Concatenation, not os.path.join: `relative` is usually absolute and
        # os.path.join would discard the debug directory.
        return [global_debug_dir + relative
                for global_debug_dir in self.global_debug_dir_list]

    def locate_in_buildid_dir(self, debuglink_name, orig_binary_name):
        """Candidates: ``<debug dir>/.build-id/<xx>/<rest>.debug`` paths.

        ``debuglink_name`` is the Build ID here; ``orig_binary_name`` is
        unused and only kept so all locate_* methods share one signature.
        """
        dir_name = debuglink_name[0:2]
        file_name = debuglink_name[2:] + ".debug"
        return [os.path.join(global_debug_dir, ".build-id", dir_name, file_name)
                for global_debug_dir in self.global_debug_dir_list]

    def get_debuginfo(self, binary_name):
        """Return the path of the file to symbolize ``binary_name`` with.

        Also updates the module-level ``prelink_offset`` for the chosen file.
        Falls back to the (filtered) binary itself when no separate debug
        file is found.
        """
        global prelink_offset
        prelink_offset = None
        orig_binary_name = binary_name
        # First apply sysroot path if defined to get real binary
        real_binary = self.use_name_filter(binary_name)
        if self.has_debuginfo(real_binary):
            return real_binary
        # Look for debuginfo file according to GDB rules
        # 1) "build-id" method
        buildid_name = self.get_buildid(real_binary)
        if buildid_name:
            for debugfile in self.locate_in_buildid_dir(buildid_name,
                                                        orig_binary_name):
                if os.path.isfile(debugfile):
                    prelink_offset = self.get_prelink_offset(debugfile, real_binary)
                    return debugfile
        # 2) "debug link" method
        debuglink_name = self.get_debuglink_name(real_binary)
        debuglink_crc = self.get_debuglink_crc(real_binary)
        if debuglink_name and debuglink_crc:
            debuglink_locate_list = [
                self.locate_in_orig_dir,
                self.locate_in_debug_subdir,
                self.locate_in_global_debug_dir]
            for debuglink_locate_function in debuglink_locate_list:
                for debugfile in debuglink_locate_function(debuglink_name,
                                                           orig_binary_name):
                    if not os.path.isfile(debugfile):
                        continue
                    # Accept the candidate only if its checksum matches the
                    # one recorded in the binary.
                    debugfile_crc = self.calc_crc32(debugfile)
                    if int(debuglink_crc, 16) == int(debugfile_crc, 16):
                        prelink_offset = self.get_prelink_offset(debugfile,
                                                                 real_binary)
                        return debugfile
        return real_binary
def guess_arch(addr):
    """Guess the target architecture from the textual width of ``addr``.

    Args:
        addr: address string as printed in the log, e.g. ``'0x7f6e35cf2e45'``.

    Returns:
        ``'x86_64'`` for addresses wider than 32 bits, ``'i386'`` otherwise.
    """
    # Guess which arch we're running. 10 = len('0x') + 8 hex digits.
    if len(addr) > 10:
        return 'x86_64'
    return 'i386'
class Symbolizer(object):
    """Base class of all symbolizers; the default implementation finds nothing."""

    def __init__(self):
        pass

    def symbolize(self, addr, binary, offset):
        """Symbolize the given address (pair of binary and offset).

        Overriden in subclasses.

        Args:
            addr: virtual address of an instruction.
            binary: path to executable/shared object containing this instruction.
            offset: instruction offset in the @binary.

        Returns:
            list of strings (one string for each inlined frame) describing
            the code locations for this instruction (that is, function name,
            file name, line and column numbers), or None when the address
            cannot be symbolized.
        """
        return None
class LLVMSymbolizer(Symbolizer):
    """Symbolizer that talks to a long-running ``llvm-symbolizer`` process."""

    def __init__(self, symbolizer_path, default_arch, system, dsym_hints=None):
        """Start the symbolizer process.

        Args:
            symbolizer_path: executable to run.
            default_arch: value passed as ``--default-arch``.
            system: platform name; ``'Darwin'`` enables ``--dsym-hint``.
            dsym_hints: iterable of .dSYM hint paths (Darwin only).
        """
        super(LLVMSymbolizer, self).__init__()
        self.symbolizer_path = symbolizer_path
        self.default_arch = default_arch
        self.system = system
        # A fresh list per instance avoids sharing a mutable default argument.
        self.dsym_hints = dsym_hints if dsym_hints is not None else []
        self.pipe = self.open_llvm_symbolizer()

    def open_llvm_symbolizer(self):
        """Spawn llvm-symbolizer; return the Popen object or None on failure."""
        cmd = [self.symbolizer_path,
               '--use-symbol-table=true',
               '--demangle=%s' % demangle,
               # The read loop in symbolize() expects linkage names and one
               # record per inlined frame.
               '--functions=linkage',
               '--inlining=true',
               '--default-arch=%s' % self.default_arch]
        if self.system == 'Darwin':
            for hint in self.dsym_hints:
                cmd.append('--dsym-hint=%s' % hint)
        try:
            # Text-mode pipes so str can be written/read on Python 2 and 3.
            result = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                      stdout=subprocess.PIPE,
                                      universal_newlines=True)
        except OSError:
            # Tool not available: callers fall back to another symbolizer.
            result = None
        return result

    def symbolize(self, addr, binary, offset):
        """Overrides Symbolizer.symbolize."""
        if not self.pipe:
            return None
        result = []
        try:
            symbolizer_input = '"%s" %s' % (binary, offset)
            # DEBUG is an optional module-level flag; look it up defensively.
            if globals().get('DEBUG'):
                print(symbolizer_input)
            self.pipe.stdin.write(symbolizer_input + '\n')
            self.pipe.stdin.flush()
            # The reply is a sequence of (function, location) line pairs
            # terminated by an empty line.
            while True:
                function_name = self.pipe.stdout.readline().rstrip()
                if not function_name:
                    break
                file_name = self.pipe.stdout.readline().rstrip()
                file_name = fix_filename(file_name)
                if (not function_name.startswith('??') or
                        not file_name.startswith('??')):
                    # Append only non-trivial frames.
                    result.append('%s in %s %s' % (addr, function_name,
                                                   file_name))
        except Exception:
            # Best effort: any pipe failure means "not symbolized" so that the
            # next symbolizer in the chain gets a chance.
            result = []
        if not result:
            result = None
        return result
def LLVMSymbolizerFactory(system, default_arch, dsym_hints=None):
    """Create an LLVMSymbolizer using the configured executable path.

    The executable is taken from ``LLVM_SYMBOLIZER_PATH``, then
    ``ASAN_SYMBOLIZER_PATH``, and finally defaults to ``llvm-symbolizer``.

    Args:
        system: platform name passed through to LLVMSymbolizer.
        default_arch: architecture passed through to LLVMSymbolizer.
        dsym_hints: optional iterable of .dSYM hint paths.
    """
    symbolizer_path = (os.getenv('LLVM_SYMBOLIZER_PATH') or
                       os.getenv('ASAN_SYMBOLIZER_PATH'))
    if not symbolizer_path:
        # Assume llvm-symbolizer is in PATH.
        symbolizer_path = 'llvm-symbolizer'
    # Never share a mutable default between calls.
    if dsym_hints is None:
        dsym_hints = []
    return LLVMSymbolizer(symbolizer_path, default_arch, system, dsym_hints)
class Addr2LineSymbolizer(Symbolizer):
    """Symbolizer that runs ``addr2line`` once per address."""

    def __init__(self, binary):
        super(Addr2LineSymbolizer, self).__init__()
        self.binary = binary

    def symbolize(self, addr, binary, offset):
        """Overrides Symbolizer.symbolize."""
        # Check the binary first so no child process is spawned needlessly.
        if self.binary != binary:
            return None
        cmd = [use_binutils_prefix('addr2line'), '-fi']
        if demangle:
            cmd += ['--demangle']
        cmd += ['-e', self.binary, offset]
        self.pipe = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                     universal_newlines=True)
        try:
            # communicate() also waits for the child, so it is reaped.
            output, _ = self.pipe.communicate()
            lines = output.splitlines()
        except Exception:
            lines = []
        if len(lines) < 2:
            # Always report something: this is the last symbolizer tried.
            lines = ['??', '??:?']
        result = []
        # Output is a sequence of (function, location) line pairs; a dangling
        # unpaired line is ignored.
        for i in range(0, len(lines) - 1, 2):
            function_name = lines[i].rstrip()
            file_name = fix_filename(lines[i + 1].rstrip())
            result.append('%s in %s %s' % (addr, function_name, file_name))
        return result
class UnbufferedLineConverter(object):
    """
    Wrap a child process that responds to each line of input with one line of
    output. Uses pty to trick the child into providing unbuffered output.
    """

    def __init__(self, args, close_stderr=False):
        """Fork ``args`` on a pseudo-terminal.

        Args:
            args: command line; ``args[0]`` is looked up in PATH.
            close_stderr: redirect the child's stderr to the null device.
        """
        # Local imports so that the script can start on Windows.
        import pty
        import termios
        pid, fd = pty.fork()
        if pid == 0:
            # We're the child. Transfer control to command.
            try:
                if close_stderr:
                    dev_null = os.open(os.devnull, os.O_WRONLY)
                    os.dup2(dev_null, 2)
                os.execvp(args[0], args)
            finally:
                # Only reached when exec failed; never run parent code here.
                os._exit(127)
        else:
            # Disable echoing.
            attr = termios.tcgetattr(fd)
            attr[3] = attr[3] & ~termios.ECHO
            termios.tcsetattr(fd, termios.TCSANOW, attr)
            # Set up a file()-like interface to the child process
            self.r = os.fdopen(fd, "r", 1)
            self.w = os.fdopen(os.dup(fd), "w", 1)

    def convert(self, line):
        """Send ``line`` to the child and return its one-line answer."""
        self.w.write(line + "\n")
        self.w.flush()
        return self.readline()

    def readline(self):
        """Return the next output line of the child, right-stripped."""
        return self.r.readline().rstrip()
class DarwinSymbolizer(Symbolizer):
    """Symbolizer that queries a persistent ``atos`` process."""

    def __init__(self, addr, binary):
        super(DarwinSymbolizer, self).__init__()
        self.binary = binary
        self.arch = guess_arch(addr)
        self.open_atos()

    def open_atos(self):
        """Start ``atos`` for this binary and architecture."""
        # DEBUG is an optional module-level flag; look it up defensively.
        if globals().get('DEBUG'):
            print('atos -o %s -arch %s' % (self.binary, self.arch))
        cmdline = ['atos', '-o', self.binary, '-arch', self.arch]
        self.atos = UnbufferedLineConverter(cmdline, close_stderr=True)

    def symbolize(self, addr, binary, offset):
        """Overrides Symbolizer.symbolize."""
        if self.binary != binary:
            return None
        atos_line = self.atos.convert('0x%x' % int(offset, 16))
        # Skip informational lines that precede the actual answer.
        while "got symbolicator for" in atos_line:
            atos_line = self.atos.readline()
        # A well-formed atos response looks like this:
        #   foo(type1, type2) (in object.name) (filename.cc:80)
        match = re.match(r'^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line)
        if globals().get('DEBUG'):
            print('atos_line: ' + atos_line)
        if match:
            function_name = match.group(1)
            # Drop the parameter list from the function name.
            function_name = re.sub(r'\(.*?\)', '', function_name)
            file_name = fix_filename(match.group(3))
            return ['%s in %s %s' % (addr, function_name, file_name)]
        return ['%s in %s' % (addr, atos_line)]
# Chain several symbolizers so that if one symbolizer fails, we fall back
# to the next symbolizer in chain.
class ChainSymbolizer(Symbolizer):
    """Try a list of symbolizers in order and return the first result."""

    def __init__(self, symbolizer_list):
        super(ChainSymbolizer, self).__init__()
        self.symbolizer_list = symbolizer_list

    def symbolize(self, addr, binary, offset):
        """Overrides Symbolizer.symbolize."""
        for symbolizer in self.symbolizer_list:
            # Factories may put None into the list when a tool is unavailable.
            if not symbolizer:
                continue
            result = symbolizer.symbolize(addr, binary, offset)
            if result:
                return result
        return None

    def append_symbolizer(self, symbolizer):
        """Add ``symbolizer`` as the last fallback of the chain."""
        self.symbolizer_list.append(symbolizer)
def BreakpadSymbolizerFactory(binary):
    """Return a BreakpadSymbolizer for ``binary`` or None.

    A symbolizer is created only when the ``BREAKPAD_SUFFIX`` environment
    variable is set and the file ``binary + suffix`` exists.
    """
    suffix = os.getenv('BREAKPAD_SUFFIX')
    # Without the guard, `binary + None` would raise a TypeError.
    if suffix:
        filename = binary + suffix
        if os.access(filename, os.F_OK):
            return BreakpadSymbolizer(filename)
    return None
def SystemSymbolizerFactory(system, addr, binary):
    """Return the platform's native symbolizer for ``binary``.

    Args:
        system: platform name as returned by ``os.uname()[0]``.
        addr: address being symbolized (used to guess the arch on Darwin).
        binary: path of the binary to symbolize.

    Returns:
        A DarwinSymbolizer on Darwin, an Addr2LineSymbolizer on Linux and
        FreeBSD, None for any other system.
    """
    if system == 'Darwin':
        return DarwinSymbolizer(addr, binary)
    # FreeBSD is accepted by SymbolizationLoop, so it needs a fallback too.
    if system in ('Linux', 'FreeBSD'):
        return Addr2LineSymbolizer(binary)
    return None
class BreakpadSymbolizer(Symbolizer):
    """Symbolizer backed by a Breakpad text symbol file."""

    def __init__(self, filename):
        """Load and index the symbol file ``filename``."""
        super(BreakpadSymbolizer, self).__init__()
        self.filename = filename
        with open(filename) as symbol_file:
            lines = symbol_file.readlines()
        self.files = []
        self.symbols = {}
        self.address_list = []
        self.addresses = {}
        # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
        fragments = lines[0].rstrip().split()
        self.arch = fragments[2]
        self.debug_id = fragments[3]
        self.binary = ' '.join(fragments[4:])
        self.parse_lines(lines[1:])

    def parse_lines(self, lines):
        """Fill the file, symbol and address tables from the record lines."""
        cur_function_addr = ''
        for line in lines:
            fragments = line.split()
            if not fragments:
                # Tolerate blank lines instead of failing on fragments[0].
                continue
            if fragments[0] == 'FILE':
                assert int(fragments[1]) == len(self.files)
                self.files.append(' '.join(fragments[2:]))
            elif fragments[0] == 'PUBLIC':
                self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
            elif fragments[0] in ['CFI', 'STACK']:
                pass
            elif fragments[0] == 'FUNC':
                cur_function_addr = int(fragments[1], 16)
                # A PUBLIC record for the same address takes precedence.
                if cur_function_addr not in self.symbols:
                    self.symbols[cur_function_addr] = ' '.join(fragments[4:])
            else:
                # Line starting with an address.
                addr = int(fragments[0], 16)
                self.address_list.append(addr)
                # Tuple of symbol address, size, line, file number.
                self.addresses[addr] = (cur_function_addr,
                                        int(fragments[1], 16),
                                        int(fragments[2]),
                                        int(fragments[3]))
        self.address_list.sort()

    def get_sym_file_line(self, addr):
        """Return ``(symbol, filename, line)`` covering ``addr`` or None."""
        if addr in self.addresses:
            key = addr
        else:
            # Closest record that starts below addr.
            index = bisect.bisect_left(self.address_list, addr)
            if index == 0:
                return None
            key = self.address_list[index - 1]
        sym_id, size, line_no, file_no = self.addresses[key]
        symbol = self.symbols[sym_id]
        filename = self.files[file_no]
        if addr < key + size:
            return symbol, filename, line_no
        return None

    def symbolize(self, addr, binary, offset):
        """Overrides Symbolizer.symbolize."""
        if self.binary != binary:
            return None
        res = self.get_sym_file_line(int(offset, 16))
        if not res:
            return None
        function_name, file_name, line_no = res
        return ['%s in %s %s:%d' % (
            addr, function_name, file_name, line_no)]
class SymbolizationLoop(object):
    """Read a sanitizer log line by line and symbolize its stack frames."""

    def __init__(self, binary_name_filter=None, dsym_hint_producer=None):
        """Select the per-line handler for the current platform.

        Args:
            binary_name_filter: optional callable mapping a binary name from
                the log to the file that should be symbolized.
            dsym_hint_producer: optional callable returning .dSYM hints for a
                binary (used on Darwin only).

        Raises:
            Exception: on an unsupported non-Windows system.
        """
        if sys.platform == 'win32':
            # ASan on Windows uses dbghelp.dll to symbolize in-process, which
            # works even in sandboxed processes. Nothing needs to be done here.
            self.process_line = self.process_line_echo
        else:
            # Used by clients who may want to supply a different binary name.
            # E.g. in Chrome several binaries may share a single .dSYM.
            self.binary_name_filter = binary_name_filter
            self.dsym_hint_producer = dsym_hint_producer
            self.system = os.uname()[0]
            if self.system not in ['Linux', 'Darwin', 'FreeBSD']:
                raise Exception('Unknown system')
            self.llvm_symbolizers = {}
            self.last_llvm_symbolizer = None
            self.dsym_hints = set([])
            self.frame_no = 0
            self.process_line = self.process_line_posix

    def symbolize_address(self, addr, binary, offset):
        """Return the symbolized frames for ``offset`` inside ``binary``."""
        # On non-Darwin (i.e. on platforms without .dSYM debug info) always use
        # a single symbolizer binary.
        # On Darwin, if the dsym hint producer is present:
        #  1. check whether we've seen this binary already; if so,
        #     use |llvm_symbolizers[binary]|, which has already loaded the debug
        #     info for this binary (might not be the case for
        #     |last_llvm_symbolizer|);
        #  2. otherwise check if we've seen all the hints for this binary already;
        #     if so, reuse |last_llvm_symbolizer| which has the full set of hints;
        #  3. otherwise create a new symbolizer and pass all currently known
        #     hints to it.
        if binary not in self.llvm_symbolizers:
            use_new_symbolizer = True
            if self.system == 'Darwin' and self.dsym_hint_producer:
                dsym_hints_for_binary = set(self.dsym_hint_producer(binary))
                use_new_symbolizer = bool(dsym_hints_for_binary - self.dsym_hints)
                self.dsym_hints |= dsym_hints_for_binary
            if self.last_llvm_symbolizer and not use_new_symbolizer:
                self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
            else:
                self.last_llvm_symbolizer = LLVMSymbolizerFactory(
                    self.system, guess_arch(addr), self.dsym_hints)
                self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
        # Use the chain of symbolizers:
        # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
        # (fall back to next symbolizer if the previous one fails).
        if binary not in symbolizers:
            symbolizers[binary] = ChainSymbolizer(
                [BreakpadSymbolizerFactory(binary), self.llvm_symbolizers[binary]])
        result = symbolizers[binary].symbolize(addr, binary, offset)
        if result is None:
            # Initialize system symbolizer only if other symbolizers failed.
            symbolizers[binary].append_symbolizer(
                SystemSymbolizerFactory(self.system, addr, binary))
            result = symbolizers[binary].symbolize(addr, binary, offset)
        # The system symbolizer must produce some result.
        assert result
        return result

    def get_symbolized_lines(self, symbolized_lines):
        """Format symbolized frames as numbered stack-trace lines.

        Returns the current log line unchanged when nothing was symbolized.
        Frames that still contain ``?`` get the original ``(binary+offset)``
        part of the log line appended.
        """
        if not symbolized_lines:
            return [self.current_line]
        result = []
        for symbolized_frame in symbolized_lines:
            if '?' in symbolized_frame:
                location = re.search(r'\(.*?\)', self.current_line)
                if location:
                    symbolized_frame += " " + location.group(0)
            result.append(' #%s %s' % (str(self.frame_no),
                                       symbolized_frame.rstrip()))
            # Inlined frames each get their own frame number.
            self.frame_no += 1
        return result

    def process_logfile(self):
        """Symbolize every line of the module-level ``logfile`` to stdout."""
        self.frame_no = 0
        for line in logfile:
            processed = self.process_line(line)
            print('\n'.join(processed))

    def process_line_echo(self, line):
        """Return ``line`` unchanged apart from trailing whitespace."""
        return [line.rstrip()]

    def process_line_posix(self, line):
        """Symbolize ``line`` if it is a stack frame, otherwise echo it."""
        self.current_line = line.rstrip()
        #0 0x7f6e35cf2e45 (/blah/foo.so+0x11fe45)
        stack_trace_line_format = (
            r'^( *#([0-9]+) *)(0x[0-9a-f]+)( *in [^/]+)? *\((.*)\+(0x[0-9a-f]+)\)')
        match = re.match(stack_trace_line_format, line)
        if not match:
            return [self.current_line]
        _, frameno_str, addr, _, binary, offset = match.groups()
        if frameno_str == '0':
            # Assume that frame #0 is the first frame of new stack trace.
            self.frame_no = 0
        original_binary = binary
        if self.binary_name_filter:
            binary = self.binary_name_filter(binary)
        # Correct offset from backtrace if the binary was prelinked
        # and printed address considers the prelink offset:
        if prelink_offset:
            real_offset = int(offset, 16)
            if real_offset > prelink_offset:
                #FIXME: Need to check that offset fits section size
                offset = hex(real_offset - prelink_offset)
            # DEBUG is an optional module-level flag; look it up defensively.
            if globals().get('DEBUG'):
                print('real address: ' + offset)
        symbolized_line = self.symbolize_address(addr, binary, offset)
        if not symbolized_line:
            if original_binary != binary:
                # Retry with the name from the log; repeating the call with
                # the filtered name could not give a different answer.
                symbolized_line = self.symbolize_address(addr, original_binary,
                                                         offset)
        return self.get_symbolized_lines(symbolized_line)
# Script entry point: parse the command line, publish the options through the
# module-level globals read by the code above, then symbolize the log.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description='ASan symbolization script',
        epilog='Example of use:\n'
               'asan_symbolize.py -c "$HOME/opt/cross/bin/armv7l-tizen-linux-gnueabi-" '
               '-s "$HOME/SymbolFiles" < asan.log')
    parser.add_argument('path_to_cut', nargs='*',
                        help='pattern to be cut from the result file path ')
    parser.add_argument('-d', '--demangle', action='store_true',
                        help='demangle function names')
    parser.add_argument('-s', metavar='SYSROOT',
                        help='set path to sysroot for sanitized binaries')
    parser.add_argument('-c', metavar='CROSS_COMPILE',
                        help='set prefix for binutils')
    parser.add_argument('-l', '--logfile', default=sys.stdin,
                        type=argparse.FileType('r'),
                        help='set log file name to parse, default is stdin')
    parser.add_argument('-y', '--debug-file-directory', metavar='DEBUGDIR',
                        help='The directories for separate debug information '
                             'files. Multiple path components can be set '
                             'concatenating them by a path separator.')
    args = parser.parse_args()
    if args.path_to_cut:
        fix_filename_patterns = args.path_to_cut
    if args.demangle:
        demangle = True
    if args.s:
        binary_name_filter = sysroot_path_filter
        sysroot_path = args.s
    if args.c:
        binutils_prefix = args.c
    if args.logfile:
        logfile = args.logfile
    else:
        logfile = sys.stdin
    if args.debug_file_directory:
        separate_debug_dir_list = args.debug_file_directory.split(":")
    if os.uname()[0] == 'Linux':
        # On Linux, resolve separate debug-info files before symbolizing; the
        # handler applies the sysroot filter itself.
        debug_info_handler = DebugInfoHandler(binary_name_filter)
        binary_name_filter = debug_info_handler.get_debuginfo
    loop = SymbolizationLoop(binary_name_filter)
    loop.process_logfile()