src/tools/memory_inspector/memory_inspector/classification/native_heap_classifier.py

   1 # Copyright 2014 The Chromium Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 """This module classifies NativeHeap objects filtering their allocations.
   6
   7 The only filter currently available is 'stacktrace', which works as follows:
   8
   9 {'name': 'rule-1', 'stacktrace': 'foo' }
  10 {'name': 'rule-2', 'stacktrace': ['foo', r'bar\s+baz']}
  11 {'name': 'rule-3', 'source_path': 'sk.*allocator'}
  12 {'name': 'rule-3', 'source_path': 'sk', 'stacktrace': 'SkAllocator'}
  13
  14
  15 rule-1 will match any allocation that has 'foo' in one of its  stack frames.
  16 rule-2 will match any allocation that has a stack frame matching 'foo' AND a
  17 followed by a stack frame matching 'bar baz'. Note that order matters, so rule-2
  18 will not match a stacktrace like ['bar baz', 'foo'].
  19 rule-3 will match any allocation in which at least one of the source paths in
  20 its stack frames matches the regex sk.*allocator.
  21 rule-4 will match any allocation which satisfies both the conditions.
  22
  23 TODO(primiano): introduce more filters after the first prototype with UI, for
  24 instance, filter by library file name or by allocation size.
  25 """
  26
  27 import collections
  28 import posixpath
  29 import re
  30
  31 from memory_inspector.classification import results
  32 from memory_inspector.classification import rules
  33 from memory_inspector.core import exceptions
  34 from memory_inspector.core import native_heap
  35
  36
  37 _RESULT_KEYS = ['bytes_allocated', 'bytes_resident']
  38
  39
  40 def LoadRules(content):
  41   """Loads and parses a native-heap rule tree from a content (string).
  42
  43   Returns:
  44     An instance of |rules.Rule|, which nodes are |_NHeapRule| instances.
  45   """
  46   return rules.Load(content, _NHeapRule)
  47
  48
  49 def Classify(nativeheap, rule_tree):
  50   """Creates aggregated results of native heaps using the provided rules.
  51
  52   Args:
  53     nativeheap: the heap dump being processed (a |NativeHeap| instance).
  54     rule_tree: the user-defined rules that define the filtering categories.
  55
  56   Returns:
  57     An instance of |AggreatedResults|.
  58   """
  59   assert(isinstance(nativeheap, native_heap.NativeHeap))
  60   assert(isinstance(rule_tree, rules.Rule))
  61
  62   res = results.AggreatedResults(rule_tree, _RESULT_KEYS)
  63   for allocation in nativeheap.allocations:
  64     res.AddToMatchingNodes(allocation,
  65                            [allocation.size, allocation.resident_size])
  66   return res
  67
  68
  69 def InferHeuristicRulesFromHeap(nheap, max_depth=3, threshold=0.02):
  70   """Infers the rules tree from a symbolized heap snapshot.
  71
  72   In lack of a specific set of rules, this method can be invoked to infer a
  73   meaningful rule tree starting from a heap snapshot. It will build a compact
  74   radix tree from the source paths of the stack traces, which height is at most
  75   |max_depth|, selecting only those nodes which contribute for at least
  76   |threshold| (1.0 = 100%) w.r.t. the total allocation of the heap snapshot.
  77   """
  78   assert(isinstance(nheap, native_heap.NativeHeap))
  79
  80   def RadixTreeInsert(node, path):
  81     """Inserts a string (path) into a radix tree (a python recursive dict).
  82
  83     e.g.: [/a/b/c, /a/b/d, /z/h] -> {'/a/b/': {'c': {}, 'd': {}}, '/z/h': {}}
  84     """
  85
  86     def GetCommonPrefix(args):
  87       """Returns the common prefix between two paths (no partial paths).
  88
  89       e.g.: /tmp/bar, /tmp/baz will return /tmp/ (and not /tmp/ba as the dumb
  90       posixpath.commonprefix implementation would do)
  91       """
  92       parts = posixpath.commonprefix(args).rpartition(posixpath.sep)[0]
  93       return parts + posixpath.sep if parts else ''
  94
  95     for node_path in node.iterkeys():
  96       pfx = GetCommonPrefix([node_path, path])
  97       if not pfx:
  98         continue
  99       if len(pfx) < len(node_path):
 100         node[pfx] = {node_path[len(pfx):] : node[node_path]}
 101         del node[node_path]
 102       if len(path) > len(pfx):
 103         RadixTreeInsert(node[pfx], path[len(pfx):])
 104       return
 105     node[path] = {}  # No common prefix, create new child in current node.
 106
 107   # Given an allocation of size N and its stack trace, heuristically determines
 108   # the source directory to be blamed for the N bytes.
 109   # The blamed_dir is the one which appears more times in the top 8 stack frames
 110   # (excluding the first 2, which usually are just the (m|c)alloc call sites).
 111   # At the end, this will generate a *leaderboard* (|blamed_dirs|) which
 112   # associates, to each source path directory, the number of bytes allocated.
 113
 114   blamed_dirs = collections.Counter()  # '/s/path' : bytes_from_this_path (int)
 115   total_allocated = 0
 116   for alloc in nheap.allocations:
 117     dir_histogram = collections.Counter()
 118     for frame in alloc.stack_trace.frames[2:10]:
 119       # Compute a histogram (for each allocation) of the top source dirs.
 120       if not frame.symbol or not frame.symbol.source_info:
 121         continue
 122       src_file = frame.symbol.source_info[0].source_file_path
 123       src_dir = posixpath.dirname(src_file.replace('\\', '/')) + '/'
 124       dir_histogram.update([src_dir])
 125     if not dir_histogram:
 126       continue
 127     # Add the blamed dir to the leaderboard.
 128     blamed_dir = dir_histogram.most_common()[0][0]
 129     blamed_dirs.update({blamed_dir : alloc.size})
 130     total_allocated += alloc.size
 131
 132   # Select only the top paths from the leaderboard which contribute for more
 133   # than |threshold| and make a radix tree out of them.
 134   radix_tree = {}
 135   for blamed_dir, alloc_size in blamed_dirs.most_common():
 136     if (1.0 * alloc_size / total_allocated) < threshold:
 137       break
 138     RadixTreeInsert(radix_tree, blamed_dir)
 139
 140   # The final step consists in generating a rule tree from the radix tree. This
 141   # is a pretty straightforward tree-clone operation, they have the same shape.
 142   def GenRulesFromRadixTree(radix_tree_node, max_depth, parent_path=''):
 143     children = []
 144     if max_depth > 0:
 145       for node_path, node_children in radix_tree_node.iteritems():
 146         child_rule = {
 147             'name': node_path[-16:],
 148             'source_path': '^' + re.escape(parent_path + node_path),
 149             'children': GenRulesFromRadixTree(
 150                 node_children, max_depth - 1, parent_path + node_path)}
 151         children += [child_rule]
 152     return children
 153
 154   rules_tree = GenRulesFromRadixTree(radix_tree, max_depth)
 155   return LoadRules(str(rules_tree))
 156
 157
 158 class _NHeapRule(rules.Rule):
 159   def __init__(self, name, filters):
 160     super(_NHeapRule, self).__init__(name)
 161
 162     # The 'stacktrace' filter can be either a string (simple case, one regex) or
 163     # a list of strings (complex case, see doc in the header of this file).
 164     stacktrace_regexs = filters.get('stacktrace', [])
 165     if isinstance(stacktrace_regexs, basestring):
 166       stacktrace_regexs = [stacktrace_regexs]
 167     self._stacktrace_regexs = []
 168     for regex in stacktrace_regexs:
 169       try:
 170         self._stacktrace_regexs.append(re.compile(regex))
 171       except re.error, descr:
 172         raise exceptions.MemoryInspectorException(
 173             'Stacktrace regex error "%s" : %s' % (regex, descr))
 174
 175     # The 'source_path' regex, instead, simply matches the source file path.
 176     self._path_regex = None
 177     path_regex = filters.get('source_path')
 178     if path_regex:
 179       try:
 180         self._path_regex = re.compile(path_regex)
 181       except re.error, descr:
 182         raise exceptions.MemoryInspectorException(
 183             'Path regex error "%s" : %s' % (path_regex, descr))
 184
 185   def Match(self, allocation):
 186     # Match the source file path, if the 'source_path' filter is specified.
 187     if self._path_regex:
 188       path_matches = False
 189       for frame in allocation.stack_trace.frames:
 190         if frame.symbol and frame.symbol.source_info:
 191           if self._path_regex.search(
 192               frame.symbol.source_info[0].source_file_path):
 193             path_matches = True
 194             break
 195       if not path_matches:
 196         return False
 197
 198     # Match the stack traces symbols, if the 'stacktrace' filter is specified.
 199     if not self._stacktrace_regexs:
 200       return True
 201     cur_regex_idx = 0
 202     cur_regex = self._stacktrace_regexs[0]
 203     for frame in allocation.stack_trace.frames:
 204       if frame.symbol and cur_regex.search(frame.symbol.name):
 205         # The current regex has been matched.
 206         if cur_regex_idx == len(self._stacktrace_regexs) - 1:
 207           return True  # All the provided regexs have been matched, we're happy.
 208         cur_regex_idx += 1
 209         cur_regex = self._stacktrace_regexs[cur_regex_idx]
 210
 211     return False  # Not all the provided regexs have been matched.