src/tools/memory_inspector/memory_inspector/classification/native_heap_classifier.py

   1 # Copyright 2014 The Chromium Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 """This module classifies NativeHeap objects filtering their allocations.
   6
   7 The only filter currently available is 'stacktrace', which works as follows:
   8
   9 {'name': 'rule-1', 'stacktrace': 'foo' }
  10 {'name': 'rule-2', 'stacktrace': ['foo', r'bar\s+baz']}
  11 {'name': 'rule-3', 'source_path': 'sk.*allocator'}
  12 {'name': 'rule-3', 'source_path': 'sk', 'stacktrace': 'SkAllocator'}
  13
  14
  15 rule-1 will match any allocation that has 'foo' in one of its  stack frames.
  16 rule-2 will match any allocation that has a stack frame matching 'foo' AND a
  17 followed by a stack frame matching 'bar baz'. Note that order matters, so rule-2
  18 will not match a stacktrace like ['bar baz', 'foo'].
  19 rule-3 will match any allocation in which at least one of the source paths in
  20 its stack frames matches the regex sk.*allocator.
  21 rule-4 will match any allocation which satisfies both the conditions.
  22
  23 TODO(primiano): introduce more filters after the first prototype with UI, for
  24 instance, filter by library file name or by allocation size.
  25 """
  26
  27 import collections
  28 import posixpath
  29 import re
  30
  31 from memory_inspector.classification import results
  32 from memory_inspector.classification import rules
  33 from memory_inspector.core import exceptions
  34 from memory_inspector.core import native_heap
  35
  36
  37 _RESULT_KEYS = ['bytes_allocated']
  38
  39
  40 def LoadRules(content):
  41   """Loads and parses a native-heap rule tree from a content (string).
  42
  43   Returns:
  44     An instance of |rules.Rule|, which nodes are |_NHeapRule| instances.
  45   """
  46   return rules.Load(content, _NHeapRule)
  47
  48
  49 def Classify(nativeheap, rule_tree):
  50   """Creates aggregated results of native heaps using the provided rules.
  51
  52   Args:
  53     nativeheap: the heap dump being processed (a |NativeHeap| instance).
  54     rule_tree: the user-defined rules that define the filtering categories.
  55
  56   Returns:
  57     An instance of |AggreatedResults|.
  58   """
  59   assert(isinstance(nativeheap, native_heap.NativeHeap))
  60   assert(isinstance(rule_tree, rules.Rule))
  61
  62   res = results.AggreatedResults(rule_tree, _RESULT_KEYS)
  63   for allocation in nativeheap.allocations:
  64     res.AddToMatchingNodes(allocation, [allocation.total_size])
  65   return res
  66
  67
  68 def InferHeuristicRulesFromHeap(nheap, max_depth=3, threshold=0.02):
  69   """Infers the rules tree from a symbolized heap snapshot.
  70
  71   In lack of a specific set of rules, this method can be invoked to infer a
  72   meaningful rule tree starting from a heap snapshot. It will build a compact
  73   radix tree from the source paths of the stack traces, which height is at most
  74   |max_depth|, selecting only those nodes which contribute for at least
  75   |threshold| (1.0 = 100%) w.r.t. the total allocation of the heap snapshot.
  76   """
  77   assert(isinstance(nheap, native_heap.NativeHeap))
  78
  79   def RadixTreeInsert(node, path):
  80     """Inserts a string (path) into a radix tree (a python recursive dict).
  81
  82     e.g.: [/a/b/c, /a/b/d, /z/h] -> {'/a/b/': {'c': {}, 'd': {}}, '/z/h': {}}
  83     """
  84
  85     def GetCommonPrefix(args):
  86       """Returns the common prefix between two paths (no partial paths).
  87
  88       e.g.: /tmp/bar, /tmp/baz will return /tmp/ (and not /tmp/ba as the dumb
  89       posixpath.commonprefix implementation would do)
  90       """
  91       parts = posixpath.commonprefix(args).rpartition(posixpath.sep)[0]
  92       return parts + posixpath.sep if parts else ''
  93
  94     for node_path in node.iterkeys():
  95       pfx = GetCommonPrefix([node_path, path])
  96       if not pfx:
  97         continue
  98       if len(pfx) < len(node_path):
  99         node[pfx] = {node_path[len(pfx):] : node[node_path]}
 100         del node[node_path]
 101       if len(path) > len(pfx):
 102         RadixTreeInsert(node[pfx], path[len(pfx):])
 103       return
 104     node[path] = {}  # No common prefix, create new child in current node.
 105
 106   # Given an allocation of size N and its stack trace, heuristically determines
 107   # the source directory to be blamed for the N bytes.
 108   # The blamed_dir is the one which appears more times in the top 8 stack frames
 109   # (excluding the first 2, which usually are just the (m|c)alloc call sites).
 110   # At the end, this will generate a *leaderboard* (|blamed_dirs|) which
 111   # associates, to each source path directory, the number of bytes allocated.
 112
 113   blamed_dirs = collections.Counter()  # '/s/path' : bytes_from_this_path (int)
 114   total_allocated = 0
 115   for alloc in nheap.allocations:
 116     dir_histogram = collections.Counter()
 117     for frame in alloc.stack_trace.frames[2:10]:
 118       # Compute a histogram (for each allocation) of the top source dirs.
 119       if not frame.symbol or not frame.symbol.source_info:
 120         continue
 121       src_file = frame.symbol.source_info[0].source_file_path
 122       src_dir = posixpath.dirname(src_file.replace('\\', '/')) + '/'
 123       dir_histogram.update([src_dir])
 124     if not dir_histogram:
 125       continue
 126     # Add the blamed dir to the leaderboard.
 127     blamed_dir = dir_histogram.most_common()[0][0]
 128     blamed_dirs.update({blamed_dir : alloc.total_size})
 129     total_allocated += alloc.total_size
 130
 131   # Select only the top paths from the leaderboard which contribute for more
 132   # than |threshold| and make a radix tree out of them.
 133   radix_tree = {}
 134   for blamed_dir, alloc_size in blamed_dirs.most_common():
 135     if (1.0 * alloc_size / total_allocated) < threshold:
 136       break
 137     RadixTreeInsert(radix_tree, blamed_dir)
 138
 139   # The final step consists in generating a rule tree from the radix tree. This
 140   # is a pretty straightforward tree-clone operation, they have the same shape.
 141   def GenRulesFromRadixTree(radix_tree_node, max_depth, parent_path=''):
 142     children = []
 143     if max_depth > 0:
 144       for node_path, node_children in radix_tree_node.iteritems():
 145         child_rule = {
 146             'name': node_path[-16:],
 147             'source_path': '^' + re.escape(parent_path + node_path),
 148             'children': GenRulesFromRadixTree(
 149                 node_children, max_depth - 1, parent_path + node_path)}
 150         children += [child_rule]
 151     return children
 152
 153   rules_tree = GenRulesFromRadixTree(radix_tree, max_depth)
 154   return LoadRules(str(rules_tree))
 155
 156
 157 class _NHeapRule(rules.Rule):
 158   def __init__(self, name, filters):
 159     super(_NHeapRule, self).__init__(name)
 160
 161     # The 'stacktrace' filter can be either a string (simple case, one regex) or
 162     # a list of strings (complex case, see doc in the header of this file).
 163     stacktrace_regexs = filters.get('stacktrace', [])
 164     if isinstance(stacktrace_regexs, basestring):
 165       stacktrace_regexs = [stacktrace_regexs]
 166     self._stacktrace_regexs = []
 167     for regex in stacktrace_regexs:
 168       try:
 169         self._stacktrace_regexs.append(re.compile(regex))
 170       except re.error, descr:
 171         raise exceptions.MemoryInspectorException(
 172             'Stacktrace regex error "%s" : %s' % (regex, descr))
 173
 174     # The 'source_path' regex, instead, simply matches the source file path.
 175     self._path_regex = None
 176     path_regex = filters.get('source_path')
 177     if path_regex:
 178       try:
 179         self._path_regex = re.compile(path_regex)
 180       except re.error, descr:
 181         raise exceptions.MemoryInspectorException(
 182             'Path regex error "%s" : %s' % (path_regex, descr))
 183
 184   def Match(self, allocation):
 185     # Match the source file path, if the 'source_path' filter is specified.
 186     if self._path_regex:
 187       path_matches = False
 188       for frame in allocation.stack_trace.frames:
 189         if frame.symbol and frame.symbol.source_info:
 190           if self._path_regex.search(
 191               frame.symbol.source_info[0].source_file_path):
 192             path_matches = True
 193             break
 194       if not path_matches:
 195         return False
 196
 197     # Match the stack traces symbols, if the 'stacktrace' filter is specified.
 198     if not self._stacktrace_regexs:
 199       return True
 200     cur_regex_idx = 0
 201     cur_regex = self._stacktrace_regexs[0]
 202     for frame in allocation.stack_trace.frames:
 203       if frame.symbol and cur_regex.search(frame.symbol.name):
 204         # The current regex has been matched.
 205         if cur_regex_idx == len(self._stacktrace_regexs) - 1:
 206           return True  # All the provided regexs have been matched, we're happy.
 207         cur_regex_idx += 1
 208         cur_regex = self._stacktrace_regexs[cur_regex_idx]
 209
 210     return False  # Not all the provided regexs have been matched.