File indexing completed on 2024-04-14 14:12:13

0001 #!/usr/bin/env python
0002 #===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
0003 #
0004 #                     The LLVM Compiler Infrastructure
0005 #
0006 # This file is distributed under the University of Illinois Open Source
0007 # License. See LICENSE.TXT for details.
0008 #
0009 #===------------------------------------------------------------------------===#
0010 import argparse
0011 import bisect
0012 import getopt
0013 import os
0014 import re
0015 import subprocess
0016 import sys
0017 
# Module-wide settings.  Several of them are overwritten by the command-line
# handling in the __main__ section of this script.

# Behaviour switches.
DEBUG = False                    # when true, tool command lines and raw I/O are echoed
demangle = False                 # forwarded to the external symbolizer tools
allow_system_symbolizer = True   # permit falling back to the system symbolizer

# Optional settings; None means "not configured".
binutils_prefix = None           # prepended to the 'addr2line' tool name
sysroot_path = None              # prepended to binary paths by sysroot_path_filter()
binary_name_filter = None        # callable applied to binary paths before symbolizing
fix_filename_patterns = None     # regex fragments that fix_filename() cuts from paths

# Stream the report is read from.
logfile = sys.stdin

# Per-binary cache of symbolizer chains, keyed by binary path.
symbolizers = {}
0027 
0028 # FIXME: merge the code that calls fix_filename().
def fix_filename(file_name):
  """Shorten and normalise a source location reported by a symbolizer.

  Args:
      file_name: location string, e.g. '/path/to/file.cc:12'.
  Returns:
      The location with every pattern from fix_filename_patterns (and all text
      preceding its last match) removed, ASan runtime sources
      ('...asan_*.cc:<line>') collapsed to '_asan_rtl_', and
      '...crtstuff.c:0' rewritten to '???:0'.
  """
  # Each entry is used as a regular-expression fragment; the greedy '.*' also
  # removes everything in front of the match.
  for path_to_cut in fix_filename_patterns or ():
    file_name = re.sub('.*' + path_to_cut, '', file_name)
  # The dots are escaped so that only a literal '.cc' / '.c' suffix matches
  # (an unescaped '.' would accept any character in that position).
  file_name = re.sub(r'.*asan_[a-z_]*\.cc:[0-9]*', '_asan_rtl_', file_name)
  file_name = re.sub(r'.*crtstuff\.c:0', '???:0', file_name)
  return file_name
0036 
def sysroot_path_filter(binary_name):
  """Return binary_name with the module-level sysroot_path prepended."""
  rooted_binary = sysroot_path + binary_name
  return rooted_binary
0039 
def guess_arch(addr):
  """Guess the architecture from the textual width of an address.

  Args:
      addr: address string as it appears in the report, e.g. '0x7f6e35cf2e45'.
  Returns:
      'x86_64' when the string is longer than '0x' plus 8 hex digits,
      otherwise 'i386'.
  """
  widest_32bit_address = len('0x') + 8
  return 'x86_64' if len(addr) > widest_32bit_address else 'i386'
0046 
class Symbolizer(object):
  """Base class of all symbolization back ends; symbolizes nothing itself."""

  def __init__(self):
    pass

  def symbolize(self, addr, binary, offset):
    """Translate one (binary, offset) pair into source-level frames.

    Subclasses override this method; the base implementation never
    produces a result.

    Args:
        addr: virtual address of the instruction.
        binary: path of the executable or shared object holding it.
        offset: offset of the instruction inside @binary.
    Returns:
        A list with one string per (possibly inlined) frame, each naming the
        function, file, line and column, or None when nothing is known.
    """
    return None
0065 
0066 
class LLVMSymbolizer(Symbolizer):
  """Symbolizer that talks to an llvm-symbolizer child process over pipes."""

  def __init__(self, symbolizer_path, default_arch, system, dsym_hints=None):
    """Remember the settings and launch the llvm-symbolizer process.

    Args:
        symbolizer_path: executable to run.
        default_arch: value passed as --default-arch.
        system: system name; dsym hints are only passed on 'Darwin'.
        dsym_hints: iterable of paths passed as --dsym-hint (optional).
    """
    super(LLVMSymbolizer, self).__init__()
    self.symbolizer_path = symbolizer_path
    self.default_arch = default_arch
    self.system = system
    # A None default avoids sharing one mutable list between all instances.
    self.dsym_hints = [] if dsym_hints is None else dsym_hints
    self.pipe = self.open_llvm_symbolizer()

  def open_llvm_symbolizer(self):
    """Start the child process.

    Returns:
        The subprocess.Popen object, or None if the process could not be
        started (OSError).
    """
    cmd = [self.symbolizer_path,
           '--use-symbol-table=true',
           '--demangle=%s' % demangle,
           '--functions=short',
           '--inlining=true',
           '--default-arch=%s' % self.default_arch]
    if self.system == 'Darwin':
      for hint in self.dsym_hints:
        cmd.append('--dsym-hint=%s' % hint)
    if DEBUG:
      print(' '.join(cmd))
    try:
      # Unbuffered text-mode pipes: requests are plain strings and each one
      # has to reach the child before its answer is read back.
      result = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                bufsize=0,
                                universal_newlines=True)
    except OSError:
      result = None
    return result

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        List of '<addr> in <function> <file>' strings, or None when the
        child is unavailable, communication failed, or only unknown ('??')
        frames were reported.
    """
    if not self.pipe:
      return None
    result = []
    try:
      symbolizer_input = '"%s" %s' % (binary, offset)
      if DEBUG:
        print(symbolizer_input)
      self.pipe.stdin.write(symbolizer_input + '\n')
      self.pipe.stdin.flush()
      # The answer is a sequence of (function, file) line pairs terminated
      # by an empty line.
      while True:
        function_name = self.pipe.stdout.readline().rstrip()
        if not function_name:
          break
        file_name = fix_filename(self.pipe.stdout.readline().rstrip())
        if not (function_name.startswith('??') and
                file_name.startswith('??')):
          # Append only non-trivial frames.
          result.append('%s in %s %s' % (addr, function_name,
                                         file_name))
    except Exception:
      # Best effort on purpose: any failure discards partial output so the
      # caller can fall back to another symbolizer.
      result = []
    return result or None
0121 
0122 
def LLVMSymbolizerFactory(system, default_arch, dsym_hints=None):
  """Create an LLVMSymbolizer, locating the executable via the environment.

  The executable is taken from LLVM_SYMBOLIZER_PATH, then from
  ASAN_SYMBOLIZER_PATH; if neither is set, 'llvm-symbolizer' is assumed to be
  in PATH.

  Args:
      system: system name forwarded to LLVMSymbolizer.
      default_arch: architecture forwarded to LLVMSymbolizer.
      dsym_hints: iterable of dsym hints forwarded to LLVMSymbolizer
          (optional; an empty list is used when omitted).
  Returns:
      The new LLVMSymbolizer instance.
  """
  # A None default avoids sharing one mutable list between all calls.
  if dsym_hints is None:
    dsym_hints = []
  symbolizer_path = (os.getenv('LLVM_SYMBOLIZER_PATH') or
                     os.getenv('ASAN_SYMBOLIZER_PATH') or
                     'llvm-symbolizer')
  return LLVMSymbolizer(symbolizer_path, default_arch, system, dsym_hints)
0131 
0132 
class Addr2LineSymbolizer(Symbolizer):
  """Symbolizer that talks to an addr2line child process for one binary."""

  def __init__(self, binary):
    """Launch addr2line for @binary."""
    super(Addr2LineSymbolizer, self).__init__()
    self.binary = binary
    self.pipe = self.open_addr2line()
    # Sent after every request; its answer marks the end of the real output.
    self.output_terminator = -1

  def open_addr2line(self):
    """Start '<binutils_prefix>addr2line -fi [--demangle] -e <binary>'.

    Returns:
        The subprocess.Popen object with stdin/stdout pipes.
    """
    addr2line_tool = 'addr2line'
    if binutils_prefix:
      addr2line_tool = binutils_prefix + addr2line_tool
    cmd = [addr2line_tool, '-fi']
    if demangle:
      cmd += ['--demangle']
    cmd += ['-e', self.binary]
    if DEBUG:
      print(' '.join(cmd))
    # Unbuffered text-mode pipes: requests are plain strings and each one has
    # to reach the child before its answer is read back.
    return subprocess.Popen(cmd,
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                            bufsize=0,
                            universal_newlines=True)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None if @binary is not the binary this instance was created for,
        otherwise a list of '<addr> in <function> <file>' strings; a single
        '?? ??:0' frame is reported when talking to addr2line fails.
    """
    if self.binary != binary:
      return None
    lines = []
    try:
      self.pipe.stdin.write('%s\n' % offset)
      self.pipe.stdin.write('%s\n' % self.output_terminator)
      self.pipe.stdin.flush()
      is_first_frame = True
      while True:
        raw_function = self.pipe.stdout.readline()
        raw_file = self.pipe.stdout.readline()
        if not raw_function:
          # readline() returns '' only at end of file, i.e. the child closed
          # its output.  Stop instead of looping forever on empty reads.
          if not lines:
            lines.append(('??', '??:0'))
          break
        function_name = raw_function.rstrip()
        file_name = raw_file.rstrip()
        if is_first_frame:
          # The first answer is always kept, even if it is '??'.
          is_first_frame = False
        elif function_name == '??':
          # Answer to the terminator request.  A failing assertion is handled
          # by the except clause below like any other error.
          assert file_name == '??:0'
          break
        lines.append((function_name, file_name))
    except Exception:
      lines.append(('??', '??:0'))
    return ['%s in %s %s' % (addr, function_name, fix_filename(file_name))
            for (function_name, file_name) in lines]
0174 
class UnbufferedLineConverter(object):
  """
  Wrap a child process that responds to each line of input with one line of
  output.  Uses pty to trick the child into providing unbuffered output.
  """
  def __init__(self, args, close_stderr=False):
    """Fork a child on a new pty and exec @args in it.

    Args:
        args: command line as a list; args[0] is looked up via PATH.
        close_stderr: if true, the child's fd 2 is redirected to /dev/null.
    """
    # Local imports so that the script can start on Windows.
    import pty
    import termios
    pid, fd = pty.fork()
    if pid == 0:
      # We're the child. Transfer control to command.
      try:
        if close_stderr:
          dev_null = os.open('/dev/null', 0)
          os.dup2(dev_null, 2)
        os.execvp(args[0], args)
      finally:
        # Only reached if the redirection or the exec raised.  Leave at once
        # so the forked copy never continues running the parent's code.
        os._exit(127)
    else:
      # Disable echoing.
      attr = termios.tcgetattr(fd)
      attr[3] = attr[3] & ~termios.ECHO
      termios.tcsetattr(fd, termios.TCSANOW, attr)
      # Set up a file()-like interface to the child process
      self.r = os.fdopen(fd, "r", 1)
      self.w = os.fdopen(os.dup(fd), "w", 1)

  def convert(self, line):
    """Send @line to the child and return its next output line."""
    self.w.write(line + "\n")
    return self.readline()

  def readline(self):
    """Return the next output line of the child without trailing whitespace."""
    return self.r.readline().rstrip()
0206 
0207 
class DarwinSymbolizer(Symbolizer):
  """Symbolizer that talks to an 'atos' child process for one binary."""

  def __init__(self, addr, binary):
    """Launch atos for @binary; the architecture is guessed from @addr."""
    super(DarwinSymbolizer, self).__init__()
    self.binary = binary
    self.arch = guess_arch(addr)
    self.open_atos()

  def open_atos(self):
    """Start 'atos -o <binary> -arch <arch>' behind a pty wrapper."""
    if DEBUG:
      print('atos -o %s -arch %s' % (self.binary, self.arch))
    cmdline = ['atos', '-o', self.binary, '-arch', self.arch]
    self.atos = UnbufferedLineConverter(cmdline, close_stderr=True)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None if @binary is not the binary this instance was created for,
        otherwise a one-element list: '<addr> in <function> <file>' for a
        well-formed atos answer, or '<addr> in <raw atos line>' otherwise.
    """
    if self.binary != binary:
      return None
    atos_line = self.atos.convert('0x%x' % int(offset, 16))
    # Skip lines containing "got symbolicator for" until the answer arrives.
    while "got symbolicator for" in atos_line:
      atos_line = self.atos.readline()
    # A well-formed atos response looks like this:
    #   foo(type1, type2) (in object.name) (filename.cc:80)
    match = re.match(r'^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line)
    if DEBUG:
      print('atos_line:  %s' % atos_line)
    if match:
      # Drop the parenthesised argument list from the function name.
      function_name = re.sub(r'\(.*?\)', '', match.group(1))
      file_name = fix_filename(match.group(3))
      return ['%s in %s %s' % (addr, function_name, file_name)]
    return ['%s in %s' % (addr, atos_line)]
0240 
0241 
0242 # Chain several symbolizers so that if one symbolizer fails, we fall back
0243 # to the next symbolizer in chain.
class ChainSymbolizer(Symbolizer):
  """Asks a list of symbolizers in order and returns the first useful answer."""

  def __init__(self, symbolizer_list):
    super(ChainSymbolizer, self).__init__()
    self.symbolizer_list = symbolizer_list

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize."""
    for candidate in self.symbolizer_list:
      # Falsy entries (e.g. None) are skipped.
      if not candidate:
        continue
      frames = candidate.symbolize(addr, binary, offset)
      if frames:
        return frames
    return None

  def append_symbolizer(self, symbolizer):
    """Add @symbolizer to the end of the chain."""
    self.symbolizer_list.append(symbolizer)
0260 
0261 
def BreakpadSymbolizerFactory(binary):
  """Create a BreakpadSymbolizer for @binary if a symbol file exists.

  The symbol file name is @binary followed by the value of the
  BREAKPAD_SUFFIX environment variable.

  Returns:
      A BreakpadSymbolizer, or None when the variable is unset/empty or the
      symbol file does not exist.
  """
  suffix = os.getenv('BREAKPAD_SUFFIX')
  if not suffix:
    return None
  symbol_file = binary + suffix
  if not os.access(symbol_file, os.F_OK):
    return None
  return BreakpadSymbolizer(symbol_file)
0269 
0270 
def SystemSymbolizerFactory(system, addr, binary):
  """Create the symbolizer that wraps the system's native tool.

  Args:
      system: system name as reported by os.uname()[0].
      addr: address string, used on Darwin to guess the architecture.
      binary: path of the binary to symbolize.
  Returns:
      A DarwinSymbolizer on 'Darwin', an Addr2LineSymbolizer on 'Linux' and
      'FreeBSD', and None for any other system.
  """
  if system == 'Darwin':
    return DarwinSymbolizer(addr, binary)
  # 'FreeBSD' is accepted by SymbolizationLoop, so it needs a system
  # symbolizer as well; addr2line is used for it just like on Linux.
  if system in ('Linux', 'FreeBSD'):
    return Addr2LineSymbolizer(binary)
  return None
0275     return Addr2LineSymbolizer(binary)
0276 
0277 
class BreakpadSymbolizer(Symbolizer):
  """Symbolizer that answers from a Breakpad text symbol file."""

  def __init__(self, filename):
    """Load and index the symbol file @filename.

    The first line must be a MODULE record; its fields provide the
    architecture, debug id and binary name.
    """
    super(BreakpadSymbolizer, self).__init__()
    self.filename = filename
    # The context manager closes the file as soon as it has been read.
    with open(filename) as symbol_file:
      lines = symbol_file.readlines()
    self.files = []          # FILE records, indexed by file number
    self.symbols = {}        # function/public address -> symbol name
    self.address_list = []   # sorted start addresses of line records
    self.addresses = {}      # line record address -> (symbol address, size,
                             #                         line, file number)
    # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
    fragments = lines[0].rstrip().split()
    self.arch = fragments[2]
    self.debug_id = fragments[3]
    self.binary = ' '.join(fragments[4:])
    self.parse_lines(lines[1:])

  def parse_lines(self, lines):
    """Index the records that follow the MODULE line."""
    cur_function_addr = ''
    for line in lines:
      fragments = line.split()
      if not fragments:
        # Blank line: nothing to index.
        continue
      record_type = fragments[0]
      if record_type == 'FILE':
        assert int(fragments[1]) == len(self.files)
        self.files.append(' '.join(fragments[2:]))
      elif record_type == 'PUBLIC':
        self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
      elif record_type in ['CFI', 'STACK']:
        pass
      elif record_type == 'FUNC':
        cur_function_addr = int(fragments[1], 16)
        # A name already recorded for this address is kept.
        if cur_function_addr not in self.symbols:
          self.symbols[cur_function_addr] = ' '.join(fragments[4:])
      else:
        # Line starting with an address.
        addr = int(record_type, 16)
        self.address_list.append(addr)
        # Tuple of symbol address, size, line, file number.
        self.addresses[addr] = (cur_function_addr,
                                int(fragments[1], 16),
                                int(fragments[2]),
                                int(fragments[3]))
    self.address_list.sort()

  def get_sym_file_line(self, addr):
    """Look up the line record covering @addr.

    Returns:
        (symbol, filename, line number), or None when no record covers @addr.
    """
    if addr in self.addresses:
      key = addr
    else:
      # Closest record starting below addr.
      index = bisect.bisect_left(self.address_list, addr)
      if index == 0:
        return None
      key = self.address_list[index - 1]
    sym_id, size, line_no, file_no = self.addresses[key]
    symbol = self.symbols[sym_id]
    filename = self.files[file_no]
    if addr < key + size:
      return symbol, filename, line_no
    return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None if @binary is not the binary named in the symbol file or the
        offset is not covered, otherwise a one-element list
        '<addr> in <function> <file>:<line>'.
    """
    if self.binary != binary:
      return None
    res = self.get_sym_file_line(int(offset, 16))
    if not res:
      return None
    function_name, file_name, line_no = res
    result = ['%s in %s %s:%d' % (
        addr, function_name, file_name, line_no)]
    # Diagnostic output only, gated on DEBUG so it does not mix with the
    # symbolized report on stdout.
    if DEBUG:
      print(result)
    return result
0350 
0351 
class SymbolizationLoop(object):
  """Reads a report line by line and replaces stack frames by symbolized ones."""

  def __init__(self, binary_name_filter=None, dsym_hint_producer=None):
    """Select the per-line handler for the current platform.

    Args:
        binary_name_filter: optional callable mapping a binary path from the
            report to the path that should be symbolized.
        dsym_hint_producer: optional callable returning dsym hints for a
            binary (used on Darwin only).
    Raises:
        Exception: on a non-Windows system other than Linux, Darwin, FreeBSD.
    """
    if sys.platform == 'win32':
      # ASan on Windows uses dbghelp.dll to symbolize in-process, which works
      # even in sandboxed processes.  Nothing needs to be done here.
      self.process_line = self.process_line_echo
    else:
      # Used by clients who may want to supply a different binary name.
      # E.g. in Chrome several binaries may share a single .dSYM.
      self.binary_name_filter = binary_name_filter
      self.dsym_hint_producer = dsym_hint_producer
      self.system = os.uname()[0]
      if self.system not in ['Linux', 'Darwin', 'FreeBSD']:
        raise Exception('Unknown system')
      self.llvm_symbolizers = {}
      self.last_llvm_symbolizer = None
      self.dsym_hints = set()
      self.frame_no = 0
      self.process_line = self.process_line_posix

  def symbolize_address(self, addr, binary, offset):
    """Symbolize one frame using the chain of symbolizers for @binary.

    Returns:
        Non-empty list of symbolized frame strings.
    Raises:
        Exception: if nothing could be symbolized and the system symbolizer
            is not allowed.
    """
    # On non-Darwin (i.e. on platforms without .dSYM debug info) always use
    # a single symbolizer binary.
    # On Darwin, if the dsym hint producer is present:
    #  1. check whether we've seen this binary already; if so,
    #     use |llvm_symbolizers[binary]|, which has already loaded the debug
    #     info for this binary (might not be the case for
    #     |last_llvm_symbolizer|);
    #  2. otherwise check if we've seen all the hints for this binary already;
    #     if so, reuse |last_llvm_symbolizer| which has the full set of hints;
    #  3. otherwise create a new symbolizer and pass all currently known
    #     .dSYM hints to it.
    if binary not in self.llvm_symbolizers:
      use_new_symbolizer = True
      if self.system == 'Darwin' and self.dsym_hint_producer:
        dsym_hints_for_binary = set(self.dsym_hint_producer(binary))
        use_new_symbolizer = bool(dsym_hints_for_binary - self.dsym_hints)
        self.dsym_hints |= dsym_hints_for_binary
      if self.last_llvm_symbolizer and not use_new_symbolizer:
        self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
      else:
        self.last_llvm_symbolizer = LLVMSymbolizerFactory(
            self.system, guess_arch(addr), self.dsym_hints)
        self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
    # Use the chain of symbolizers:
    # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
    # (fall back to next symbolizer if the previous one fails).
    if binary not in symbolizers:
      symbolizers[binary] = ChainSymbolizer(
          [BreakpadSymbolizerFactory(binary), self.llvm_symbolizers[binary]])
    result = symbolizers[binary].symbolize(addr, binary, offset)
    if result is None:
      if not allow_system_symbolizer:
        raise Exception('Failed to launch or use llvm-symbolizer.')
      # Initialize system symbolizer only if other symbolizers failed.
      symbolizers[binary].append_symbolizer(
          SystemSymbolizerFactory(self.system, addr, binary))
      result = symbolizers[binary].symbolize(addr, binary, offset)
    # The system symbolizer must produce some result.
    assert result
    return result

  def get_symbolized_lines(self, symbolized_lines):
    """Format symbolized frames as report lines.

    Returns:
        The unmodified current line (as a one-element list) when
        @symbolized_lines is empty or None; otherwise one '    #<n> <frame>'
        line per frame, numbered from self.frame_no, which is advanced.
    """
    if not symbolized_lines:
      return [self.current_line]
    result = []
    for symbolized_frame in symbolized_lines:
      result.append('    #%s %s' % (str(self.frame_no),
                                    symbolized_frame.rstrip()))
      self.frame_no += 1
    return result

  def process_logfile(self):
    """Process every line of the module-level logfile and print the result."""
    self.frame_no = 0
    for line in logfile:
      processed = self.process_line(line)
      print('\n'.join(processed))

  def process_line_echo(self, line):
    """Return @line unchanged apart from trailing whitespace."""
    return [line.rstrip()]

  def process_line_posix(self, line):
    """Symbolize @line if it is a stack frame, otherwise echo it.

    Returns:
        List of output lines for this input line.
    """
    self.current_line = line.rstrip()
    #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
    stack_trace_line_format = (
        r'^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)')
    match = re.match(stack_trace_line_format, line)
    if not match:
      return [self.current_line]
    if DEBUG:
      print(line)
    _, frameno_str, addr, binary, offset = match.groups()
    if frameno_str == '0':
      # Assume that frame #0 is the first frame of new stack trace.
      self.frame_no = 0
    original_binary = binary
    if self.binary_name_filter:
      binary = self.binary_name_filter(binary)
    symbolized_line = self.symbolize_address(addr, binary, offset)
    if not symbolized_line and original_binary != binary:
      # The filtered name gave nothing: retry with the name from the report.
      symbolized_line = self.symbolize_address(addr, original_binary, offset)
    return self.get_symbolized_lines(symbolized_line)
0455 
0456 
if __name__ == '__main__':
  # Command-line entry point: parse the options, store them in the
  # module-level settings and symbolize the selected log.
  usage_example = (
      'Example of use:\n'
      'asan_symbolize.py -c "$HOME/opt/cross/bin/arm-linux-gnueabi-" '
      '-s "$HOME/SymbolFiles" < asan.log')
  cli = argparse.ArgumentParser(
      description='ASan symbolization script',
      epilog=usage_example,
      formatter_class=argparse.RawDescriptionHelpFormatter)
  cli.add_argument('path_to_cut', nargs='*',
                   help='pattern to be cut from the result file path ')
  cli.add_argument('-d','--demangle', action='store_true',
                   help='demangle function names')
  cli.add_argument('-s', metavar='SYSROOT',
                   help='set path to sysroot for sanitized binaries')
  cli.add_argument('-c', metavar='CROSS_COMPILE',
                   help='set prefix for binutils')
  cli.add_argument('-l','--logfile', default=sys.stdin,
                   type=argparse.FileType('r'),
                   help='set log file name to parse, default is stdin')
  options = cli.parse_args()

  # Only options that were actually given replace the defaults.
  if options.path_to_cut:
    fix_filename_patterns = options.path_to_cut
  if options.demangle:
    demangle = True
  if options.s:
    sysroot_path = options.s
    binary_name_filter = sysroot_path_filter
  if options.c:
    binutils_prefix = options.c
  logfile = options.logfile or sys.stdin

  SymbolizationLoop(binary_name_filter).process_logfile()
0489   loop = SymbolizationLoop(binary_name_filter)
0490   loop.process_logfile()