src/tools/swarming_client/trace_inputs.py

   1 #!/usr/bin/env python
   2 # coding=utf-8
   3 # Copyright 2012 The Swarming Authors. All rights reserved.
   4 # Use of this source code is governed under the Apache License, Version 2.0 that
   5 # can be found in the LICENSE file.
   6
   7 """Traces an executable and its child processes and extract the files accessed
   8 by them.
   9
  10 The implementation uses OS-specific API. The native Kernel logger and the ETL
  11 interface is used on Windows. Dtrace is used on OSX. Strace is used otherwise.
  12 The OS-specific implementation is hidden in an 'API' interface.
  13
  14 The results are embedded in a Results instance. The tracing is done in two
  15 phases, the first is to do the actual trace and generate an
  16 implementation-specific log file. Then the log file is parsed to extract the
  17 information, including the individual child processes and the files accessed
  18 from the log.
  19 """
  20
  21 import codecs
  22 import csv
  23 import errno
  24 import getpass
  25 import glob
  26 import logging
  27 import os
  28 import re
  29 import stat
  30 import subprocess
  31 import sys
  32 import tempfile
  33 import threading
  34 import time
  35 import weakref
  36
  37 from third_party import colorama
  38 from third_party.depot_tools import fix_encoding
  39 from third_party.depot_tools import subcommand
  40
  41 from utils import file_path
  42 from utils import tools
  43
  44 ## OS-specific imports
  45
  46 if sys.platform == 'win32':
  47   from ctypes.wintypes import byref, c_int, c_wchar_p
  48   from ctypes.wintypes import windll # pylint: disable=E0611
  49
  50
  51 __version__ = '0.1'
  52
  53
  54 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
  55 ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
  56
  57
  58 class TracingFailure(Exception):
  59   """An exception occured during tracing."""
  60   def __init__(self, description, pid, line_number, line, *args):
  61     super(TracingFailure, self).__init__(
  62         description, pid, line_number, line, *args)
  63     self.description = description
  64     self.pid = pid
  65     self.line_number = line_number
  66     self.line = line
  67     self.extra = args
  68
  69   def __str__(self):
  70     out = self.description
  71     if self.pid:
  72       out += '\npid: %d' % self.pid
  73     if self.line_number:
  74       out += '\nline: %d' % self.line_number
  75     if self.line:
  76       out += '\n%s' % self.line
  77     if self.extra:
  78       out += '\n' + ', '.join(map(str, filter(None, self.extra)))
  79     return out
  80
  81
  82 ## OS-specific functions
  83
  84 if sys.platform == 'win32':
  85   def get_current_encoding():
  86     """Returns the 'ANSI' code page associated to the process."""
  87     return 'cp%d' % int(windll.kernel32.GetACP())
  88
  89
  90   def CommandLineToArgvW(command_line):
  91     """Splits a commandline into argv using CommandLineToArgvW()."""
  92     # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx
  93     size = c_int()
  94     assert isinstance(command_line, unicode)
  95     ptr = windll.shell32.CommandLineToArgvW(command_line, byref(size))
  96     try:
  97       return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]
  98     finally:
  99       windll.kernel32.LocalFree(ptr)
 100
 101
 102 def can_trace():
 103   """Returns True if the user is an administrator on Windows.
 104
 105   It is required for tracing to work.
 106   """
 107   if sys.platform != 'win32':
 108     return True
 109   return bool(windll.shell32.IsUserAnAdmin())
 110
 111
 112 def create_subprocess_thunk():
 113   """Creates a small temporary script to start the child process.
 114
 115   This thunk doesn't block, its unique name is used to identify it as the
 116   parent.
 117   """
 118   handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
 119   try:
 120     os.write(
 121         handle,
 122         (
 123           'import subprocess, sys\n'
 124           'sys.exit(subprocess.call(sys.argv[2:]))\n'
 125         ))
 126   finally:
 127     os.close(handle)
 128   return name
 129
 130
 131 def create_exec_thunk():
 132   """Creates a small temporary script to start the child executable.
 133
 134   Reads from the file handle provided as the fisrt argument to block, then
 135   execv() the command to be traced.
 136   """
 137   handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
 138   try:
 139     os.write(
 140         handle,
 141         (
 142           'import os, sys\n'
 143           'fd = int(sys.argv[1])\n'
 144           # This will block until the controlling process writes a byte on the
 145           # pipe. It will do so once the tracing tool, e.g. strace, is ready to
 146           # trace.
 147           'os.read(fd, 1)\n'
 148           'os.close(fd)\n'
 149           'os.execv(sys.argv[2], sys.argv[2:])\n'
 150         ))
 151   finally:
 152     os.close(handle)
 153   return name
 154
 155
 156 def strace_process_quoted_arguments(text):
 157   """Extracts quoted arguments on a string and return the arguments as a list.
 158
 159   Implemented as an automaton. Supports incomplete strings in the form
 160   '"foo"...'.
 161
 162   Example:
 163     With text = '"foo", "bar"', the function will return ['foo', 'bar']
 164
 165   TODO(maruel): Implement escaping.
 166   """
 167   # All the possible states of the DFA.
 168   ( NEED_QUOTE,         # Begining of a new arguments.
 169     INSIDE_STRING,      # Inside an argument.
 170     ESCAPED,            # Found a '\' inside a quote. Treat the next char as-is.
 171     NEED_COMMA_OR_DOT,  # Right after the closing quote of an argument. Could be
 172                         # a serie of 3 dots or a comma.
 173     NEED_SPACE,         # Right after a comma
 174     NEED_DOT_2,         # Found a dot, need a second one.
 175     NEED_DOT_3,         # Found second dot, need a third one.
 176     NEED_COMMA,         # Found third dot, need a comma.
 177     ) = range(8)
 178
 179   state = NEED_QUOTE
 180   out = []
 181   for index, char in enumerate(text):
 182     if char == '"':
 183       if state == NEED_QUOTE:
 184         state = INSIDE_STRING
 185         # A new argument was found.
 186         out.append('')
 187       elif state == INSIDE_STRING:
 188         # The argument is now closed.
 189         state = NEED_COMMA_OR_DOT
 190       elif state == ESCAPED:
 191         out[-1] += char
 192         state = INSIDE_STRING
 193       else:
 194         raise ValueError(
 195             'Can\'t process char \'%s\' at column %d for: %r' % (
 196               char, index, text),
 197             index,
 198             text)
 199     elif char == ',':
 200       if state in (NEED_COMMA_OR_DOT, NEED_COMMA):
 201         state = NEED_SPACE
 202       elif state == INSIDE_STRING:
 203         out[-1] += char
 204       elif state == ESCAPED:
 205         out[-1] += char
 206         state = INSIDE_STRING
 207       else:
 208         raise ValueError(
 209             'Can\'t process char \'%s\' at column %d for: %r' % (
 210               char, index, text),
 211             index,
 212             text)
 213     elif char == ' ':
 214       if state == NEED_SPACE:
 215         state = NEED_QUOTE
 216       elif state == INSIDE_STRING:
 217         out[-1] += char
 218       elif state == ESCAPED:
 219         out[-1] += char
 220         state = INSIDE_STRING
 221       else:
 222         raise ValueError(
 223             'Can\'t process char \'%s\' at column %d for: %r' % (
 224               char, index, text),
 225             index,
 226             text)
 227     elif char == '.':
 228       if state in (NEED_QUOTE, NEED_COMMA_OR_DOT):
 229         # The string is incomplete, this mean the strace -s flag should be
 230         # increased.
 231         # For NEED_QUOTE, the input string would look like '"foo", ...'.
 232         # For NEED_COMMA_OR_DOT, the input string would look like '"foo"...'
 233         state = NEED_DOT_2
 234       elif state == NEED_DOT_2:
 235         state = NEED_DOT_3
 236       elif state == NEED_DOT_3:
 237         state = NEED_COMMA
 238       elif state == INSIDE_STRING:
 239         out[-1] += char
 240       elif state == ESCAPED:
 241         out[-1] += char
 242         state = INSIDE_STRING
 243       else:
 244         raise ValueError(
 245             'Can\'t process char \'%s\' at column %d for: %r' % (
 246               char, index, text),
 247             index,
 248             text)
 249     elif char == '\\':
 250       if state == ESCAPED:
 251         out[-1] += char
 252         state = INSIDE_STRING
 253       elif state == INSIDE_STRING:
 254         state = ESCAPED
 255       else:
 256         raise ValueError(
 257             'Can\'t process char \'%s\' at column %d for: %r' % (
 258               char, index, text),
 259             index,
 260             text)
 261     else:
 262       if state == INSIDE_STRING:
 263         out[-1] += char
 264       else:
 265         raise ValueError(
 266             'Can\'t process char \'%s\' at column %d for: %r' % (
 267               char, index, text),
 268             index,
 269             text)
 270   if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):
 271     raise ValueError(
 272         'String is incorrectly terminated: %r' % text,
 273         text)
 274   return out
 275
 276
 277 def assert_is_renderable(pseudo_string):
 278   """Asserts the input is a valid object to be processed by render()."""
 279   assert (
 280       pseudo_string is None or
 281       isinstance(pseudo_string, unicode) or
 282       hasattr(pseudo_string, 'render')), repr(pseudo_string)
 283
 284
 285 def render(pseudo_string):
 286   """Converts the pseudo-string to an unicode string."""
 287   if pseudo_string is None or isinstance(pseudo_string, unicode):
 288     return pseudo_string
 289   return pseudo_string.render()
 290
 291
 292 class Results(object):
 293   """Results of a trace session."""
 294
 295   class _TouchedObject(object):
 296     """Something, a file or a directory, that was accessed."""
 297     def __init__(self, root, path, tainted, size, nb_files):
 298       logging.debug(
 299           '%s(%s, %s, %s, %s, %s)' %
 300           (self.__class__.__name__, root, path, tainted, size, nb_files))
 301       assert_is_renderable(root)
 302       assert_is_renderable(path)
 303       self.root = root
 304       self.path = path
 305       self.tainted = tainted
 306       self.nb_files = nb_files
 307       # Can be used as a cache or a default value, depending on context. In
 308       # particular, once self.tainted is True, because the path was replaced
 309       # with a variable, it is not possible to look up the file size.
 310       self._size = size
 311       # These are cache only.
 312       self._real_path = None
 313
 314       # Check internal consistency.
 315       assert path, path
 316       assert tainted or bool(root) != bool(file_path.isabs(path)), (root, path)
 317       assert tainted or (
 318           not os.path.exists(self.full_path) or
 319           (self.full_path == file_path.get_native_path_case(self.full_path))), (
 320               tainted,
 321               self.full_path,
 322               file_path.get_native_path_case(self.full_path))
 323
 324     @property
 325     def existent(self):
 326       return self.size != -1
 327
 328     @property
 329     def full_path(self):
 330       if self.root:
 331         return os.path.join(self.root, self.path)
 332       return self.path
 333
 334     @property
 335     def real_path(self):
 336       """Returns the path with symlinks resolved."""
 337       if not self._real_path:
 338         self._real_path = os.path.realpath(self.full_path)
 339       return self._real_path
 340
 341     @property
 342     def size(self):
 343       """File's size. -1 is not existent.
 344
 345       Once tainted, it is not possible the retrieve the file size anymore since
 346       the path is composed of variables.
 347       """
 348       if self._size is None and not self.tainted:
 349         try:
 350           self._size = os.stat(self.full_path).st_size
 351         except OSError:
 352           self._size = -1
 353       return self._size
 354
 355     def flatten(self):
 356       """Returns a dict representing this object.
 357
 358       A 'size' of 0 means the file was only touched and not read.
 359       """
 360       return {
 361         'path': self.path,
 362         'size': self.size,
 363       }
 364
 365     def replace_variables(self, variables):
 366       """Replaces the root of this File with one of the variables if it matches.
 367
 368       If a variable replacement occurs, the cloned object becomes tainted.
 369       """
 370       for variable, root_path in variables.iteritems():
 371         if self.path.startswith(root_path):
 372           return self._clone(
 373               self.root, variable + self.path[len(root_path):], True)
 374       # No need to clone, returns ourself.
 375       return self
 376
 377     def strip_root(self, root):
 378       """Returns a clone of itself with 'root' stripped off.
 379
 380       Note that the file is kept if it is either accessible from a symlinked
 381       path that was used to access the file or through the real path.
 382       """
 383       # Check internal consistency.
 384       assert (
 385           self.tainted or
 386           (file_path.isabs(root) and root.endswith(os.path.sep))), root
 387       if not self.full_path.startswith(root):
 388         # Now try to resolve the symlinks to see if it can be reached this way.
 389         # Only try *after* trying without resolving symlink.
 390         if not self.real_path.startswith(root):
 391           return None
 392         path = self.real_path
 393       else:
 394         path = self.full_path
 395       return self._clone(root, path[len(root):], self.tainted)
 396
 397     def _clone(self, new_root, new_path, tainted):
 398       raise NotImplementedError(self.__class__.__name__)
 399
 400   class File(_TouchedObject):
 401     """A file that was accessed. May not be present anymore.
 402
 403     If tainted is true, it means it is not a real path anymore as a variable
 404     replacement occured.
 405
 406     |mode| can be one of None, TOUCHED, READ or WRITE.
 407     """
 408     # Was probed for existence, and it is existent, but was never _opened_.
 409     TOUCHED = 't'
 410     # Opened for read only and guaranteed to not have been written to.
 411     READ = 'r'
 412     # Opened for write.
 413     WRITE = 'w'
 414
 415     # They are listed in order of priority. E.g. if a file is traced as TOUCHED
 416     # then as WRITE, only keep WRITE. None means no idea, which is a problem on
 417     # Windows.
 418     ACCEPTABLE_MODES = (None, TOUCHED, READ, WRITE)
 419
 420     def __init__(self, root, path, tainted, size, mode):
 421       assert mode in self.ACCEPTABLE_MODES
 422       super(Results.File, self).__init__(root, path, tainted, size, 1)
 423       self.mode = mode
 424
 425     def _clone(self, new_root, new_path, tainted):
 426       """Clones itself keeping meta-data."""
 427       # Keep the self.size and self._real_path caches for performance reason. It
 428       # is also important when the file becomes tainted (with a variable instead
 429       # of the real path) since self.path is not an on-disk path anymore so
 430       # out._size cannot be updated.
 431       out = self.__class__(new_root, new_path, tainted, self.size, self.mode)
 432       out._real_path = self._real_path
 433       return out
 434
 435     def flatten(self):
 436       out = super(Results.File, self).flatten()
 437       out['mode'] = self.mode
 438       return out
 439
 440   class Directory(_TouchedObject):
 441     """A directory of files. Must exist.
 442
 443     For a Directory instance, self.size is not a cache, it's an actual value
 444     that is never modified and represents the total size of the files contained
 445     in this directory. It is possible that the directory is empty so that
 446     size==0; this happens if there's only an invalid symlink in it.
 447     """
 448     def __init__(self, root, path, tainted, size, nb_files):
 449       """path='.' is a valid value and must be handled appropriately."""
 450       assert not path.endswith(os.path.sep), path
 451       super(Results.Directory, self).__init__(
 452           root, path + os.path.sep, tainted, size, nb_files)
 453
 454     def flatten(self):
 455       out = super(Results.Directory, self).flatten()
 456       out['nb_files'] = self.nb_files
 457       return out
 458
 459     def _clone(self, new_root, new_path, tainted):
 460       """Clones itself keeping meta-data."""
 461       out = self.__class__(
 462           new_root,
 463           new_path.rstrip(os.path.sep),
 464           tainted,
 465           self.size,
 466           self.nb_files)
 467       out._real_path = self._real_path
 468       return out
 469
 470   class Process(object):
 471     """A process that was traced.
 472
 473     Contains references to the files accessed by this process and its children.
 474     """
 475     def __init__(self, pid, files, executable, command, initial_cwd, children):
 476       logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
 477       self.pid = pid
 478       self.files = sorted(files, key=lambda x: x.path)
 479       self.children = children
 480       self.executable = executable
 481       self.command = command
 482       self.initial_cwd = initial_cwd
 483
 484       # Check internal consistency.
 485       assert len(set(f.path for f in self.files)) == len(self.files), sorted(
 486           f.path for f in self.files)
 487       assert isinstance(self.children, list)
 488       assert isinstance(self.files, list)
 489
 490     @property
 491     def all(self):
 492       for child in self.children:
 493         for i in child.all:
 494           yield i
 495       yield self
 496
 497     def flatten(self):
 498       return {
 499         'children': [c.flatten() for c in self.children],
 500         'command': self.command,
 501         'executable': self.executable,
 502         'files': [f.flatten() for f in self.files],
 503         'initial_cwd': self.initial_cwd,
 504         'pid': self.pid,
 505       }
 506
 507     def strip_root(self, root):
 508       assert file_path.isabs(root) and root.endswith(os.path.sep), root
 509       # Loads the files after since they are constructed as objects.
 510       out = self.__class__(
 511           self.pid,
 512           filter(None, (f.strip_root(root) for f in self.files)),
 513           self.executable,
 514           self.command,
 515           self.initial_cwd,
 516           [c.strip_root(root) for c in self.children])
 517       logging.debug(
 518           'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
 519       return out
 520
 521   def __init__(self, process):
 522     self.process = process
 523     # Cache.
 524     self._files = None
 525
 526   def flatten(self):
 527     return {
 528       'root': self.process.flatten(),
 529     }
 530
 531   @property
 532   def files(self):
 533     if self._files is None:
 534       self._files = sorted(
 535           sum((p.files for p in self.process.all), []),
 536           key=lambda x: x.path)
 537     return self._files
 538
 539   @property
 540   def existent(self):
 541     return [f for f in self.files if f.existent]
 542
 543   @property
 544   def non_existent(self):
 545     return [f for f in self.files if not f.existent]
 546
 547   def strip_root(self, root):
 548     """Returns a clone with all the files outside the directory |root| removed
 549     and converts all the path to be relative paths.
 550
 551     It keeps files accessible through the |root| directory or that have been
 552     accessed through any symlink which points to the same directory.
 553     """
 554     # Resolve any symlink
 555     root = os.path.realpath(root)
 556     root = (
 557         file_path.get_native_path_case(root).rstrip(os.path.sep) + os.path.sep)
 558     logging.debug('strip_root(%s)' % root)
 559     return Results(self.process.strip_root(root))
 560
 561
 562 class ApiBase(object):
 563   """OS-agnostic API to trace a process and its children."""
 564   class Context(object):
 565     """Processes one log line at a time and keeps the list of traced processes.
 566
 567     The parsing is complicated by the fact that logs are traced out of order for
 568     strace but in-order for dtrace and logman. In addition, on Windows it is
 569     very frequent that processids are reused so a flat list cannot be used. But
 570     at the same time, it is impossible to faithfully construct a graph when the
 571     logs are processed out of order. So both a tree and a flat mapping are used,
 572     the tree is the real process tree, while the flat mapping stores the last
 573     valid process for the corresponding processid. For the strace case, the
 574     tree's head is guessed at the last moment.
 575     """
 576     class Process(object):
 577       """Keeps context for one traced child process.
 578
 579       Logs all the files this process touched. Ignores directories.
 580       """
 581       def __init__(self, blacklist, pid, initial_cwd):
 582         # Check internal consistency.
 583         assert isinstance(pid, int), repr(pid)
 584         assert_is_renderable(initial_cwd)
 585         self.pid = pid
 586         # children are Process instances.
 587         self.children = []
 588         self.initial_cwd = initial_cwd
 589         self.cwd = None
 590         self.files = {}
 591         self.executable = None
 592         self.command = None
 593         self._blacklist = blacklist
 594
 595       def to_results_process(self):
 596         """Resolves file case sensitivity and or late-bound strings."""
 597         # When resolving files, it's normal to get dupe because a file could be
 598         # opened multiple times with different case. Resolve the deduplication
 599         # here.
 600         def fix_path(x):
 601           """Returns the native file path case.
 602
 603           Converts late-bound strings.
 604           """
 605           if not x:
 606             # Do not convert None instance to 'None'.
 607             return x
 608           x = render(x)
 609           if os.path.isabs(x):
 610             # If the path is not absolute, which tends to happen occasionally on
 611             # Windows, it is not possible to get the native path case so ignore
 612             # that trace. It mostly happens for 'executable' value.
 613             x = file_path.get_native_path_case(x)
 614           return x
 615
 616         def fix_and_blacklist_path(x, m):
 617           """Receives a tuple (filepath, mode) and processes filepath."""
 618           x = fix_path(x)
 619           if not x:
 620             return
 621           # The blacklist needs to be reapplied, since path casing could
 622           # influence blacklisting.
 623           if self._blacklist(x):
 624             return
 625           # Filters out directories. Some may have passed through.
 626           if os.path.isdir(x):
 627             return
 628           return x, m
 629
 630         # Renders all the files as strings, as some could be RelativePath
 631         # instances. It is important to do it first since there could still be
 632         # multiple entries with the same path but different modes.
 633         rendered = (
 634             fix_and_blacklist_path(f, m) for f, m in self.files.iteritems())
 635         files = sorted(
 636           (f for f in rendered if f),
 637           key=lambda x: (x[0], Results.File.ACCEPTABLE_MODES.index(x[1])))
 638         # Then converting into a dict will automatically clean up lesser
 639         # important values.
 640         files = [
 641           Results.File(None, f, False, None, m)
 642           for f, m in dict(files).iteritems()
 643         ]
 644         return Results.Process(
 645             self.pid,
 646             files,
 647             fix_path(self.executable),
 648             self.command,
 649             fix_path(self.initial_cwd),
 650             [c.to_results_process() for c in self.children])
 651
 652       def add_file(self, filepath, mode):
 653         """Adds a file if it passes the blacklist."""
 654         if self._blacklist(render(filepath)):
 655           return
 656         logging.debug('add_file(%d, %s, %s)', self.pid, filepath, mode)
 657         # Note that filepath and not render(filepath) is added. It is because
 658         # filepath could be something else than a string, like a RelativePath
 659         # instance for dtrace logs.
 660         modes = Results.File.ACCEPTABLE_MODES
 661         old_mode = self.files.setdefault(filepath, mode)
 662         if old_mode != mode and modes.index(old_mode) < modes.index(mode):
 663           # Take the highest value.
 664           self.files[filepath] = mode
 665
 666     def __init__(self, blacklist):
 667       self.blacklist = blacklist
 668       # Initial process.
 669       self.root_process = None
 670       # dict to accelerate process lookup, to not have to lookup the whole graph
 671       # each time.
 672       self._process_lookup = {}
 673
 674   class Tracer(object):
 675     """During it's lifetime, the tracing subsystem is enabled."""
 676     def __init__(self, logname):
 677       self._logname = logname
 678       self._lock = threading.RLock()
 679       self._traces = []
 680       self._initialized = True
 681       self._scripts_to_cleanup = []
 682
 683     def trace(self, cmd, cwd, tracename, output):
 684       """Runs the OS-specific trace program on an executable.
 685
 686       Arguments:
 687       - cmd: The command (a list) to run.
 688       - cwd: Current directory to start the child process in.
 689       - tracename: Name of the trace in the logname file.
 690       - output: If False, redirects output to PIPEs.
 691
 692       Returns a tuple (resultcode, output) and updates the internal trace
 693       entries.
 694       """
 695       # The implementation adds an item to self._traces.
 696       raise NotImplementedError(self.__class__.__name__)
 697
 698     def close(self, _timeout=None):
 699       """Saves the meta-data in the logname file.
 700
 701       For kernel-based tracing, stops the tracing subsystem.
 702
 703       Must not be used manually when using 'with' construct.
 704       """
 705       with self._lock:
 706         if not self._initialized:
 707           raise TracingFailure(
 708               'Called %s.close() on an unitialized object' %
 709                   self.__class__.__name__,
 710               None, None, None)
 711         try:
 712           while self._scripts_to_cleanup:
 713             try:
 714               os.remove(self._scripts_to_cleanup.pop())
 715             except OSError as e:
 716               logging.error('Failed to delete a temporary script: %s', e)
 717           tools.write_json(self._logname, self._gen_logdata(), False)
 718         finally:
 719           self._initialized = False
 720
 721     def post_process_log(self):
 722       """Post-processes the log so it becomes faster to load afterward.
 723
 724       Must not be used manually when using 'with' construct.
 725       """
 726       assert not self._initialized, 'Must stop tracing first.'
 727
 728     def _gen_logdata(self):
 729       """Returns the data to be saved in the trace file."""
 730       return  {
 731         'traces': self._traces,
 732       }
 733
 734     def __enter__(self):
 735       """Enables 'with' statement."""
 736       return self
 737
 738     def __exit__(self, exc_type, exc_value, traceback):
 739       """Enables 'with' statement."""
 740       self.close()
 741       # If an exception was thrown, do not process logs.
 742       if not exc_type:
 743         self.post_process_log()
 744
 745   def get_tracer(self, logname):
 746     """Returns an ApiBase.Tracer instance.
 747
 748     Initializes the tracing subsystem, which is a requirement for kernel-based
 749     tracers. Only one tracer instance should be live at a time!
 750
 751     logname is the filepath to the json file that will contain the meta-data
 752     about the logs.
 753     """
 754     return self.Tracer(logname)
 755
 756   @staticmethod
 757   def clean_trace(logname):
 758     """Deletes an old log."""
 759     raise NotImplementedError()
 760
 761   @classmethod
 762   def parse_log(cls, logname, blacklist, trace_name):
 763     """Processes trace logs and returns the files opened and the files that do
 764     not exist.
 765
 766     It does not track directories.
 767
 768     Arguments:
 769       - logname: must be an absolute path.
 770       - blacklist: must be a lambda.
 771       - trace_name: optional trace to read, defaults to reading all traces.
 772
 773     Most of the time, files that do not exist are temporary test files that
 774     should be put in /tmp instead. See http://crbug.com/116251.
 775
 776     Returns a list of dict with keys:
 777       - results: A Results instance.
 778       - trace: The corresponding tracename parameter provided to
 779                get_tracer().trace().
 780       - output: Output gathered during execution, if get_tracer().trace(...,
 781                 output=False) was used.
 782     """
 783     raise NotImplementedError(cls.__class__.__name__)
 784
 785
 786 class Strace(ApiBase):
 787   """strace implies linux."""
 788   @staticmethod
 789   def load_filename(filename):
 790     """Parses a filename in a log."""
 791     # TODO(maruel): Be compatible with strace -x.
 792     assert isinstance(filename, str)
 793     out = ''
 794     i = 0
 795     while i < len(filename):
 796       c = filename[i]
 797       if c == '\\':
 798         out += chr(int(filename[i+1:i+4], 8))
 799         i += 4
 800       else:
 801         out += c
 802         i += 1
 803     # TODO(maruel): That's not necessarily true that the current code page is
 804     # utf-8.
 805     return out.decode('utf-8')
 806
 807   class Context(ApiBase.Context):
 808     """Processes a strace log line and keeps the list of existent and non
 809     existent files accessed.
 810
 811     Ignores directories.
 812
 813     Uses late-binding to processes the cwd of each process. The problem is that
 814     strace generates one log file per process it traced but doesn't give any
 815     information about which process was started when and by who. So process the
 816     logs out of order and use late binding with RelativePath to be able to
 817     deduce the initial directory of each process once all the logs are parsed.
 818
 819     TODO(maruel): Use the script even in the non-sudo case, so log parsing can
 820     be done in two phase: first find the root process, then process the child
 821     processes in order. With that, it should be possible to not use RelativePath
 822     anymore. This would significantly simplify the code!
 823     """
 824     class Process(ApiBase.Context.Process):
 825       """Represents the state of a process.
 826
 827       Contains all the information retrieved from the pid-specific log.
 828       """
 829       # Function names are using ([a-z_0-9]+)
 830       # This is the most common format. function(args) = result
 831       RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.*?)\)\s+= (.+)$')
 832       # An interrupted function call, only grab the minimal header.
 833       RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')
 834       # A resumed function call.
 835       RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')
 836       # A process received a signal.
 837       RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')
 838       # A process didn't handle a signal. Ignore any junk appearing before,
 839       # because the process was forcibly killed so it won't open any new file.
 840       RE_KILLED = re.compile(
 841           r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$')
 842       # The process has exited.
 843       RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')
 844       # A call was canceled. Ignore any prefix.
 845       RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$')
 846       # Happens when strace fails to even get the function name.
 847       UNNAMED_FUNCTION = '????'
 848
 849       # Corner-case in python, a class member function decorator must not be
 850       # @staticmethod.
 851       def parse_args(regexp, expect_zero):  # pylint: disable=E0213
 852         """Automatically convert the str 'args' into a list of processed
 853         arguments.
 854
 855         Arguments:
 856         - regexp is used to parse args.
 857         - expect_zero: one of True, False or None.
 858           - True: will check for result.startswith('0') first and will ignore
 859             the trace line completely otherwise. This is important because for
 860             many functions, the regexp will not process if the call failed.
 861           - False: will check for not result.startswith(('?', '-1')) for the
 862             same reason than with True.
 863           - None: ignore result.
 864         """
 865         def meta_hook(function):
 866           assert function.__name__.startswith('handle_')
 867           def hook(self, args, result):
 868             if expect_zero is True and not result.startswith('0'):
 869               return
 870             if expect_zero is False and result.startswith(('?', '-1')):
 871               return
 872             match = re.match(regexp, args)
 873             if not match:
 874               raise TracingFailure(
 875                   'Failed to parse %s(%s) = %s' %
 876                   (function.__name__[len('handle_'):], args, result),
 877                   None, None, None)
 878             return function(self, match.groups(), result)
 879           return hook
 880         return meta_hook
 881
 882       class RelativePath(object):
 883         """A late-bound relative path."""
 884         def __init__(self, parent, value):
 885           assert_is_renderable(parent)
 886           self.parent = parent
 887           assert (
 888               value is None or
 889               (isinstance(value, unicode) and not os.path.isabs(value)))
 890           self.value = value
 891           if self.value:
 892             # TODO(maruel): On POSIX, '\\' is a valid character so remove this
 893             # assert.
 894             assert '\\' not in self.value, value
 895             assert '\\' not in self.value, (repr(value), repr(self.value))
 896
 897         def render(self):
 898           """Returns the current directory this instance is representing.
 899
 900           This function is used to return the late-bound value.
 901           """
 902           assert self.parent is not None
 903           parent = render(self.parent)
 904           if self.value:
 905             return os.path.normpath(os.path.join(parent, self.value))
 906           return parent
 907
 908       def __init__(self, root, pid):
 909         """Keeps enough information to be able to guess the original process
 910         root.
 911
 912         strace doesn't store which process was the initial process. So more
 913         information needs to be kept so the graph can be reconstructed from the
 914         flat map.
 915         """
 916         logging.info('%s(%d)' % (self.__class__.__name__, pid))
 917         super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)
 918         assert isinstance(root, ApiBase.Context)
 919         self._root = weakref.ref(root)
 920         # The dict key is the function name of the pending call, like 'open'
 921         # or 'execve'.
 922         self._pending_calls = {}
 923         self._line_number = 0
 924         # Current directory when the process started.
 925         if isinstance(self._root(), unicode):
 926           self.initial_cwd = self._root()
 927         else:
 928           self.initial_cwd = self.RelativePath(self._root(), None)
 929         self.parentid = None
 930         self._done = False
 931
 932       def get_cwd(self):
 933         """Returns the best known value of cwd."""
 934         return self.cwd or self.initial_cwd
 935
 936       def render(self):
 937         """Returns the string value of the RelativePath() object.
 938
 939         Used by RelativePath. Returns the initial directory and not the
 940         current one since the current directory 'cwd' validity is time-limited.
 941
 942         The validity is only guaranteed once all the logs are processed.
 943         """
 944         return self.initial_cwd.render()
 945
 946       def on_line(self, line):
 947         assert isinstance(line, str)
 948         self._line_number += 1
 949         try:
 950           if self._done:
 951             raise TracingFailure(
 952                 'Found a trace for a terminated process or corrupted log',
 953                 None, None, None)
 954
 955           if self.RE_SIGNAL.match(line):
 956             # Ignore signals.
 957             return
 958
 959           match = self.RE_KILLED.match(line)
 960           if match:
 961             # Converts a '+++ killed by Foo +++' trace into an exit_group().
 962             self.handle_exit_group(match.group(1), None)
 963             return
 964
 965           match = self.RE_PROCESS_EXITED.match(line)
 966           if match:
 967             # Converts a '+++ exited with 1 +++' trace into an exit_group()
 968             self.handle_exit_group(match.group(1), None)
 969             return
 970
 971           match = self.RE_UNFINISHED.match(line)
 972           if match:
 973             if match.group(1) in self._pending_calls:
 974               raise TracingFailure(
 975                   'Found two unfinished calls for the same function',
 976                   None, None, None,
 977                   self._pending_calls)
 978             self._pending_calls[match.group(1)] = (
 979                 match.group(1) + match.group(2))
 980             return
 981
 982           match = self.RE_UNAVAILABLE.match(line)
 983           if match:
 984             # This usually means a process was killed and a pending call was
 985             # canceled.
 986             # TODO(maruel): Look up the last exit_group() trace just above and
 987             # make sure any self._pending_calls[anything] is properly flushed.
 988             return
 989
 990           match = self.RE_RESUMED.match(line)
 991           if match:
 992             if match.group(1) not in self._pending_calls:
 993               raise TracingFailure(
 994                   'Found a resumed call that was not logged as unfinished',
 995                   None, None, None,
 996                   self._pending_calls)
 997             pending = self._pending_calls.pop(match.group(1))
 998             # Reconstruct the line.
 999             line = pending + match.group(2)
1000
1001           match = self.RE_HEADER.match(line)
1002           if not match:
1003             # The line is corrupted. It happens occasionally when a process is
1004             # killed forcibly with activity going on. Assume the process died.
1005             # No other line can be processed afterward.
1006             logging.debug('%d is done: %s', self.pid, line)
1007             self._done = True
1008             return
1009
1010           if match.group(1) == self.UNNAMED_FUNCTION:
1011             return
1012
1013           # It's a valid line, handle it.
1014           handler = getattr(self, 'handle_%s' % match.group(1), None)
1015           if not handler:
1016             self._handle_unknown(match.group(1), match.group(2), match.group(3))
1017           return handler(match.group(2), match.group(3))
1018         except TracingFailure, e:
1019           # Hack in the values since the handler could be a static function.
1020           e.pid = self.pid
1021           e.line = line
1022           e.line_number = self._line_number
1023           # Re-raise the modified exception.
1024           raise
1025         except (KeyError, NotImplementedError, ValueError), e:
1026           raise TracingFailure(
1027               'Trace generated a %s exception: %s' % (
1028                   e.__class__.__name__, str(e)),
1029               self.pid,
1030               self._line_number,
1031               line,
1032               e)
1033
1034       @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True)
1035       def handle_access(self, args, _result):
1036         self._handle_file(args[0], Results.File.TOUCHED)
1037
1038       @parse_args(r'^\"(.+?)\"$', True)
1039       def handle_chdir(self, args, _result):
1040         """Updates cwd."""
1041         self.cwd = self._mangle(args[0])
1042         logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
1043
1044       @parse_args(r'^\"(.+?)\", (\d+), (\d+)$', False)
1045       def handle_chown(self, args, _result):
1046         # TODO(maruel): Look at result?
1047         self._handle_file(args[0], Results.File.WRITE)
1048
1049       def handle_clone(self, _args, result):
1050         self._handling_forking('clone', result)
1051
1052       def handle_close(self, _args, _result):
1053         pass
1054
1055       @parse_args(r'^\"(.+?)\", (\d+)$', False)
1056       def handle_chmod(self, args, _result):
1057         self._handle_file(args[0], Results.File.WRITE)
1058
1059       @parse_args(r'^\"(.+?)\", (\d+)$', False)
1060       def handle_creat(self, args, _result):
1061         self._handle_file(args[0], Results.File.WRITE)
1062
1063       @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True)
1064       def handle_execve(self, args, _result):
1065         # Even if in practice execve() doesn't returns when it succeeds, strace
1066         # still prints '0' as the result.
1067         filepath = args[0]
1068         self._handle_file(filepath, Results.File.READ)
1069         self.executable = self._mangle(filepath)
1070         try:
1071           self.command = strace_process_quoted_arguments(args[1])
1072         except ValueError as e:
1073           raise TracingFailure(
1074               'Failed to process command line argument:\n%s' % e.args[0],
1075               None, None, None)
1076
1077       def handle_exit_group(self, _args, _result):
1078         """Removes cwd."""
1079         self.cwd = None
1080
1081       @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', True)
1082       def handle_faccessat(self, args, _results):
1083         if args[0] == 'AT_FDCWD':
1084           self._handle_file(args[1], Results.File.TOUCHED)
1085         else:
1086           raise Exception('Relative faccess not implemented.')
1087
1088       def handle_fallocate(self, _args, result):
1089         pass
1090
1091       def handle_fork(self, args, result):
1092         self._handle_unknown('fork', args, result)
1093
1094       def handle_futex(self, _args, _result):
1095         pass
1096
1097       @parse_args(r'^(\".+?\"|0x[a-f0-9]+), (\d+)$', False)
1098       def handle_getcwd(self, args, _result):
1099         # TODO(maruel): Resolve file handle.
1100         if not args[0].startswith('0x'):
1101           filepath = args[0][1:-1]
1102           if os.path.isabs(filepath):
1103             logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
1104             if not isinstance(self.cwd, unicode):
1105               # Take the occasion to reset the path.
1106               self.cwd = self._mangle(filepath)
1107             else:
1108               # It should always match.
1109               assert self.cwd == Strace.load_filename(filepath), (
1110                   self.cwd, filepath)
1111
1112       @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1113       def handle_link(self, args, _result):
1114         self._handle_file(args[0], Results.File.READ)
1115         self._handle_file(args[1], Results.File.WRITE)
1116
1117       @parse_args(r'^(\".+?\"|0x[a-f0-9]+), \{(?:.+?, )?\.\.\.\}$', True)
1118       def handle_lstat(self, args, _result):
1119         # TODO(maruel): Resolve file handle.
1120         if not args[0].startswith('0x'):
1121           self._handle_file(args[0][1:-1], Results.File.TOUCHED)
1122
1123       def handle_mkdir(self, _args, _result):
1124         # We track content, not directories.
1125         pass
1126
1127       @parse_args(r'^(\".*?\"|0x[a-f0-9]+), ([A-Z\_\|]+)(|, \d+)$', False)
1128       def handle_open(self, args, _result):
1129         if 'O_DIRECTORY' in args[1]:
1130           return
1131         # TODO(maruel): Resolve file handle.
1132         if not args[0].startswith('0x'):
1133           t = Results.File.READ if 'O_RDONLY' in args[1] else Results.File.WRITE
1134           self._handle_file(args[0][1:-1], t)
1135
1136       @parse_args(
1137           r'^(\d+|AT_FDCWD), (\".*?\"|0x[a-f0-9]+), ([A-Z\_\|]+)(|, \d+)$',
1138           False)
1139       def handle_openat(self, args, _result):
1140         if 'O_DIRECTORY' in args[2]:
1141           return
1142         if args[0] == 'AT_FDCWD':
1143           # TODO(maruel): Resolve file handle.
1144           if not args[1].startswith('0x'):
1145             t = (
1146                 Results.File.READ if 'O_RDONLY' in args[2]
1147                 else Results.File.WRITE)
1148           self._handle_file(args[1][1:-1], t)
1149         else:
1150           # TODO(maruel): Implement relative open if necessary instead of the
1151           # AT_FDCWD flag, let's hope not since this means tracking all active
1152           # directory handles.
1153           raise NotImplementedError('Relative open via openat not implemented.')
1154
1155       @parse_args(
1156           r'^(\".+?\"|0x[a-f0-9]+), (?:\".+?\"(?:\.\.\.)?|0x[a-f0-9]+), '
1157             '\d+$',
1158           False)
1159       def handle_readlink(self, args, _result):
1160         # TODO(maruel): Resolve file handle.
1161         if not args[0].startswith('0x'):
1162           self._handle_file(args[0][1:-1], Results.File.READ)
1163
1164       @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1165       def handle_rename(self, args, _result):
1166         self._handle_file(args[0], Results.File.READ)
1167         self._handle_file(args[1], Results.File.WRITE)
1168
1169       def handle_rmdir(self, _args, _result):
1170         # We track content, not directories.
1171         pass
1172
1173       def handle_setxattr(self, _args, _result):
1174         # TODO(maruel): self._handle_file(args[0], Results.File.WRITE)
1175         pass
1176
1177       @parse_args(r'^(\".+?\"|0x[a-f0-9]+), \{.+?\}$', True)
1178       def handle_stat(self, args, _result):
1179         # TODO(maruel): Resolve file handle.
1180         if not args[0].startswith('0x'):
1181           self._handle_file(args[0][1:-1], Results.File.TOUCHED)
1182
1183       @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1184       def handle_symlink(self, args, _result):
1185         self._handle_file(args[0], Results.File.TOUCHED)
1186         self._handle_file(args[1], Results.File.WRITE)
1187
1188       @parse_args(r'^\"(.+?)\", \d+', True)
1189       def handle_truncate(self, args, _result):
1190         self._handle_file(args[0], Results.File.WRITE)
1191
1192       def handle_unlink(self, _args, _result):
1193         # In theory, the file had to be created anyway.
1194         pass
1195
1196       def handle_unlinkat(self, _args, _result):
1197         # In theory, the file had to be created anyway.
1198         pass
1199
1200       def handle_statfs(self, _args, _result):
1201         pass
1202
1203       def handle_utimensat(self, _args, _result):
1204         pass
1205
1206       def handle_vfork(self, _args, result):
1207         self._handling_forking('vfork', result)
1208
1209       @staticmethod
1210       def _handle_unknown(function, args, result):
1211         raise TracingFailure(
1212             'Unexpected/unimplemented trace %s(%s)= %s' %
1213             (function, args, result),
1214             None, None, None)
1215
1216       def _handling_forking(self, name, result):
1217         """Transfers cwd."""
1218         if result.startswith(('?', '-1')):
1219           # The call failed.
1220           return
1221         # Update the other process right away.
1222         childpid = int(result)
1223         child = self._root().get_or_set_proc(childpid)
1224         if child.parentid is not None or childpid in self.children:
1225           raise TracingFailure(
1226               'Found internal inconsitency in process lifetime detection '
1227               'during a %s() call' % name,
1228               None, None, None)
1229
1230         # Copy the cwd object.
1231         child.initial_cwd = self.get_cwd()
1232         child.parentid = self.pid
1233         # It is necessary because the logs are processed out of order.
1234         self.children.append(child)
1235
1236       def _handle_file(self, filepath, mode):
1237         filepath = self._mangle(filepath)
1238         self.add_file(filepath, mode)
1239
1240       def _mangle(self, filepath):
1241         """Decodes a filepath found in the log and convert it to a late-bound
1242         path if necessary.
1243
1244         |filepath| is an strace 'encoded' string and the returned value is
1245         either an unicode string if the path was absolute or a late bound path
1246         otherwise.
1247         """
1248         filepath = Strace.load_filename(filepath)
1249         if os.path.isabs(filepath):
1250           return filepath
1251         else:
1252           if isinstance(self.get_cwd(), unicode):
1253             return os.path.normpath(os.path.join(self.get_cwd(), filepath))
1254           return self.RelativePath(self.get_cwd(), filepath)
1255
1256     def __init__(self, blacklist, root_pid, initial_cwd):
1257       """|root_pid| may be None when the root process is not known.
1258
1259       In that case, a search is done after reading all the logs to figure out
1260       the root process.
1261       """
1262       super(Strace.Context, self).__init__(blacklist)
1263       assert_is_renderable(initial_cwd)
1264       self.root_pid = root_pid
1265       self.initial_cwd = initial_cwd
1266
1267     def render(self):
1268       """Returns the string value of the initial cwd of the root process.
1269
1270       Used by RelativePath.
1271       """
1272       return self.initial_cwd
1273
1274     def on_line(self, pid, line):
1275       """Transfers control into the Process.on_line() function."""
1276       self.get_or_set_proc(pid).on_line(line.strip())
1277
1278     def to_results(self):
1279       """If necessary, finds back the root process and verify consistency."""
1280       if not self.root_pid:
1281         # The non-sudo case. The traced process was started by strace itself,
1282         # so the pid of the traced process is not known.
1283         root = [p for p in self._process_lookup.itervalues() if not p.parentid]
1284         if len(root) == 1:
1285           self.root_process = root[0]
1286           # Save it for later.
1287           self.root_pid = self.root_process.pid
1288       else:
1289         # The sudo case. The traced process was started manually so its pid is
1290         # known.
1291         self.root_process = self._process_lookup.get(self.root_pid)
1292       if not self.root_process:
1293         raise TracingFailure(
1294             'Found internal inconsitency in process lifetime detection '
1295             'while finding the root process',
1296             None,
1297             None,
1298             None,
1299             self.root_pid,
1300             sorted(self._process_lookup))
1301       process = self.root_process.to_results_process()
1302       if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1303         raise TracingFailure(
1304             'Found internal inconsitency in process lifetime detection '
1305             'while looking for len(tree) == len(list)',
1306             None,
1307             None,
1308             None,
1309             sorted(self._process_lookup),
1310             sorted(p.pid for p in process.all))
1311       return Results(process)
1312
1313     def get_or_set_proc(self, pid):
1314       """Returns the Context.Process instance for this pid or creates a new one.
1315       """
1316       if not pid or not isinstance(pid, int):
1317         raise TracingFailure(
1318             'Unpexpected value for pid: %r' % pid,
1319             pid,
1320             None,
1321             None,
1322             pid)
1323       if pid not in self._process_lookup:
1324         self._process_lookup[pid] = self.Process(self, pid)
1325       return self._process_lookup[pid]
1326
1327     @classmethod
1328     def traces(cls):
1329       """Returns the list of all handled traces to pass this as an argument to
1330       strace.
1331       """
1332       prefix = 'handle_'
1333       return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)]
1334
1335   class Tracer(ApiBase.Tracer):
1336     MAX_LEN = 256
1337
1338     def __init__(self, logname, use_sudo):
1339       super(Strace.Tracer, self).__init__(logname)
1340       self.use_sudo = use_sudo
1341       if use_sudo:
1342         # TODO(maruel): Use the jump script systematically to make it easy to
1343         # figure out the root process, so RelativePath is not necessary anymore.
1344         self._child_script = create_exec_thunk()
1345         self._scripts_to_cleanup.append(self._child_script)
1346
1347     def trace(self, cmd, cwd, tracename, output):
1348       """Runs strace on an executable.
1349
1350       When use_sudo=True, it is a 3-phases process: start the thunk, start
1351       sudo strace with the pid of the thunk and then have the thunk os.execve()
1352       the process to trace.
1353       """
1354       logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
1355       assert os.path.isabs(cmd[0]), cmd[0]
1356       assert os.path.isabs(cwd), cwd
1357       assert os.path.normpath(cwd) == cwd, cwd
1358       with self._lock:
1359         if not self._initialized:
1360           raise TracingFailure(
1361               'Called Tracer.trace() on an unitialized object',
1362               None, None, None, tracename)
1363         assert tracename not in (i['trace'] for i in self._traces)
1364       stdout = stderr = None
1365       if output:
1366         stdout = subprocess.PIPE
1367         stderr = subprocess.STDOUT
1368
1369       # Ensure all file related APIs are hooked.
1370       traces = ','.join(Strace.Context.traces() + ['file'])
1371       flags = [
1372         # Each child process has its own trace file. It is necessary because
1373         # strace may generate corrupted log file if multiple processes are
1374         # heavily doing syscalls simultaneously.
1375         '-ff',
1376         # Reduce whitespace usage.
1377         '-a1',
1378         # hex encode non-ascii strings.
1379         # TODO(maruel): '-x',
1380         # TODO(maruel): '-ttt',
1381         # Signals are unnecessary noise here. Note the parser can cope with them
1382         # but reduce the unnecessary output.
1383         '-esignal=none',
1384         # Print as much data as wanted.
1385         '-s', '%d' % self.MAX_LEN,
1386         '-e', 'trace=%s' % traces,
1387         '-o', self._logname + '.' + tracename,
1388       ]
1389
1390       if self.use_sudo:
1391         pipe_r, pipe_w = os.pipe()
1392         # Start the child process paused.
1393         target_cmd = [sys.executable, self._child_script, str(pipe_r)] + cmd
1394         logging.debug(' '.join(target_cmd))
1395         child_proc = subprocess.Popen(
1396             target_cmd,
1397             stdin=subprocess.PIPE,
1398             stdout=stdout,
1399             stderr=stderr,
1400             cwd=cwd)
1401
1402         # TODO(maruel): both processes must use the same UID for it to work
1403         # without sudo. Look why -p is failing at the moment without sudo.
1404         trace_cmd = [
1405           'sudo',
1406           'strace',
1407           '-p', str(child_proc.pid),
1408         ] + flags
1409         logging.debug(' '.join(trace_cmd))
1410         strace_proc = subprocess.Popen(
1411             trace_cmd,
1412             cwd=cwd,
1413             stdin=subprocess.PIPE,
1414             stdout=subprocess.PIPE,
1415             stderr=subprocess.PIPE)
1416
1417         line = strace_proc.stderr.readline()
1418         if not re.match(r'^Process \d+ attached \- interrupt to quit$', line):
1419           # TODO(maruel): Raise an exception.
1420           assert False, line
1421
1422         # Now fire the child process.
1423         os.write(pipe_w, 'x')
1424
1425         out = child_proc.communicate()[0]
1426         strace_out = strace_proc.communicate()[0]
1427
1428         # TODO(maruel): if strace_proc.returncode: Add an exception.
1429         saved_out = strace_out if strace_proc.returncode else out
1430         root_pid = child_proc.pid
1431       else:
1432         # Non-sudo case.
1433         trace_cmd = [
1434           'strace',
1435         ] + flags + cmd
1436         logging.debug(' '.join(trace_cmd))
1437         child_proc = subprocess.Popen(
1438             trace_cmd,
1439             cwd=cwd,
1440             stdin=subprocess.PIPE,
1441             stdout=stdout,
1442             stderr=stderr)
1443         out = child_proc.communicate()[0]
1444         # TODO(maruel): Walk the logs and figure out the root process would
1445         # simplify parsing the logs a *lot*.
1446         saved_out = out
1447         # The trace reader will have to figure out.
1448         root_pid = None
1449
1450       with self._lock:
1451         assert tracename not in (i['trace'] for i in self._traces)
1452         self._traces.append(
1453           {
1454             'cmd': cmd,
1455             'cwd': cwd,
1456             'output': saved_out,
1457             'pid': root_pid,
1458             'trace': tracename,
1459           })
1460       return child_proc.returncode, out
1461
1462   def __init__(self, use_sudo=None):
1463     super(Strace, self).__init__()
1464     self.use_sudo = use_sudo
1465
1466   def get_tracer(self, logname):
1467     return self.Tracer(logname, self.use_sudo)
1468
1469   @staticmethod
1470   def clean_trace(logname):
1471     if os.path.isfile(logname):
1472       os.remove(logname)
1473     # Also delete any pid specific file from previous traces.
1474     for i in glob.iglob(logname + '.*'):
1475       if i.rsplit('.', 1)[1].isdigit():
1476         os.remove(i)
1477
1478   @classmethod
1479   def parse_log(cls, logname, blacklist, trace_name):
1480     logging.info('parse_log(%s, ..., %s)', logname, trace_name)
1481     assert os.path.isabs(logname)
1482     data = tools.read_json(logname)
1483     out = []
1484     for item in data['traces']:
1485       if trace_name and item['trace'] != trace_name:
1486         continue
1487       result = {
1488         'output': item['output'],
1489         'trace': item['trace'],
1490       }
1491       try:
1492         context = cls.Context(blacklist, item['pid'], item['cwd'])
1493         for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])):
1494           logging.debug('Reading %s', pidfile)
1495           pid = pidfile.rsplit('.', 1)[1]
1496           if pid.isdigit():
1497             pid = int(pid)
1498             found_line = False
1499             for line in open(pidfile, 'rb'):
1500               context.on_line(pid, line)
1501               found_line = True
1502             if not found_line:
1503               # Ensures that a completely empty trace still creates the
1504               # corresponding Process instance by logging a dummy line.
1505               context.on_line(pid, '')
1506           else:
1507             logging.warning('Found unexpected file %s', pidfile)
1508         result['results'] = context.to_results()
1509       except TracingFailure:
1510         result['exception'] = sys.exc_info()
1511       out.append(result)
1512     return out
1513
1514
1515 class Dtrace(ApiBase):
1516   """Uses DTrace framework through dtrace. Requires root access.
1517
1518   Implies Mac OSX.
1519
1520   dtruss can't be used because it has compatibility issues with python.
1521
1522   Also, the pid->cwd handling needs to be done manually since OSX has no way to
1523   get the absolute path of the 'cwd' dtrace variable from the probe.
1524
1525   Also, OSX doesn't populate curpsinfo->pr_psargs properly, see
1526   https://discussions.apple.com/thread/1980539. So resort to handling execve()
1527   manually.
1528
1529   errno is not printed in the log since this implementation currently only cares
1530   about files that were successfully opened.
1531   """
1532   class Context(ApiBase.Context):
1533     # Format: index pid function(args)
1534     RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$')
1535
1536     # Arguments parsing.
1537     RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$')
1538     RE_CHDIR = re.compile(r'^\"(.+?)\"$')
1539     RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$')
1540     RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$')
1541     RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$')
1542     RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')
1543
1544     # O_DIRECTORY is not defined on Windows and dtrace doesn't exist on Windows.
1545     O_DIRECTORY = os.O_DIRECTORY if hasattr(os, 'O_DIRECTORY') else None
1546     O_RDWR = os.O_RDWR
1547     O_WRONLY = os.O_WRONLY
1548
1549     class Process(ApiBase.Context.Process):
1550       def __init__(self, *args):
1551         super(Dtrace.Context.Process, self).__init__(*args)
1552         self.cwd = self.initial_cwd
1553
1554     def __init__(self, blacklist, thunk_pid, initial_cwd):
1555       logging.info(
1556           '%s(%d, %s)' % (self.__class__.__name__, thunk_pid, initial_cwd))
1557       super(Dtrace.Context, self).__init__(blacklist)
1558       assert isinstance(initial_cwd, unicode), initial_cwd
1559       # Process ID of the temporary script created by create_subprocess_thunk().
1560       self._thunk_pid = thunk_pid
1561       self._initial_cwd = initial_cwd
1562       self._line_number = 0
1563
1564     def on_line(self, line):
1565       assert isinstance(line, unicode), line
1566       self._line_number += 1
1567       match = self.RE_HEADER.match(line)
1568       if not match:
1569         raise TracingFailure(
1570             'Found malformed line: %s' % line,
1571             None,
1572             self._line_number,
1573             line)
1574       fn = getattr(
1575           self,
1576           'handle_%s' % match.group(2).replace('-', '_'),
1577           self._handle_ignored)
1578       # It is guaranteed to succeed because of the regexp. Or at least I thought
1579       # it would.
1580       pid = int(match.group(1))
1581       try:
1582         return fn(pid, match.group(3))
1583       except TracingFailure, e:
1584         # Hack in the values since the handler could be a static function.
1585         e.pid = pid
1586         e.line = line
1587         e.line_number = self._line_number
1588         # Re-raise the modified exception.
1589         raise
1590       except (KeyError, NotImplementedError, ValueError), e:
1591         raise TracingFailure(
1592             'Trace generated a %s exception: %s' % (
1593                 e.__class__.__name__, str(e)),
1594             pid,
1595             self._line_number,
1596             line,
1597             e)
1598
1599     def to_results(self):
1600       process = self.root_process.to_results_process()
1601       # Internal concistency check.
1602       if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1603         raise TracingFailure(
1604             'Found internal inconsitency in process lifetime detection '
1605             'while looking for len(tree) == len(list)',
1606             None,
1607             None,
1608             None,
1609             sorted(self._process_lookup),
1610             sorted(p.pid for p in process.all))
1611       return Results(process)
1612
1613     def handle_dtrace_BEGIN(self, _pid, args):
1614       if not self.RE_DTRACE_BEGIN.match(args):
1615         raise TracingFailure(
1616             'Found internal inconsitency in dtrace_BEGIN log line',
1617             None, None, None)
1618
1619     def handle_proc_start(self, pid, args):
1620       """Transfers cwd.
1621
1622       The dtrace script already takes care of only tracing the processes that
1623       are child of the traced processes so there is no need to verify the
1624       process hierarchy.
1625       """
1626       if pid in self._process_lookup:
1627         raise TracingFailure(
1628             'Found internal inconsitency in proc_start: %d started two times' %
1629                 pid,
1630             None, None, None)
1631       match = self.RE_PROC_START.match(args)
1632       if not match:
1633         raise TracingFailure(
1634             'Failed to parse arguments: %s' % args,
1635             None, None, None)
1636       ppid = int(match.group(1))
1637       if ppid == self._thunk_pid and not self.root_process:
1638         proc = self.root_process = self.Process(
1639             self.blacklist, pid, self._initial_cwd)
1640       elif ppid in self._process_lookup:
1641         proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd)
1642         self._process_lookup[ppid].children.append(proc)
1643       else:
1644         # Another process tree, ignore.
1645         return
1646       self._process_lookup[pid] = proc
1647       logging.debug(
1648           'New child: %s -> %d  cwd:%s' %
1649           (ppid, pid, render(proc.initial_cwd)))
1650
1651     def handle_proc_exit(self, pid, _args):
1652       """Removes cwd."""
1653       if pid in self._process_lookup:
1654         # self._thunk_pid is not traced itself and other traces run neither.
1655         self._process_lookup[pid].cwd = None
1656
1657     def handle_execve(self, pid, args):
1658       """Sets the process' executable.
1659
1660       TODO(maruel): Read command line arguments.  See
1661       https://discussions.apple.com/thread/1980539 for an example.
1662       https://gist.github.com/1242279
1663
1664       Will have to put the answer at http://stackoverflow.com/questions/7556249.
1665       :)
1666       """
1667       if not pid in self._process_lookup:
1668         # Another process tree, ignore.
1669         return
1670       match = self.RE_EXECVE.match(args)
1671       if not match:
1672         raise TracingFailure(
1673             'Failed to parse arguments: %r' % args,
1674             None, None, None)
1675       proc = self._process_lookup[pid]
1676       proc.executable = match.group(1)
1677       self._handle_file(pid, proc.executable, Results.File.READ)
1678       proc.command = self.process_escaped_arguments(match.group(3))
1679       if int(match.group(2)) != len(proc.command):
1680         raise TracingFailure(
1681             'Failed to parse execve() arguments: %s' % args,
1682             None, None, None)
1683
1684     def handle_chdir(self, pid, args):
1685       """Updates cwd."""
1686       if pid not in self._process_lookup:
1687         # Another process tree, ignore.
1688         return
1689       cwd = self.RE_CHDIR.match(args).group(1)
1690       if not cwd.startswith('/'):
1691         cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd)
1692         logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))
1693       else:
1694         logging.debug('handle_chdir(%d, %s)' % (pid, cwd))
1695         cwd2 = cwd
1696       self._process_lookup[pid].cwd = cwd2
1697
1698     def handle_open_nocancel(self, pid, args):
1699       """Redirects to handle_open()."""
1700       return self.handle_open(pid, args)
1701
1702     def handle_open(self, pid, args):
1703       if pid not in self._process_lookup:
1704         # Another process tree, ignore.
1705         return
1706       match = self.RE_OPEN.match(args)
1707       if not match:
1708         raise TracingFailure(
1709             'Failed to parse arguments: %s' % args,
1710             None, None, None)
1711       flag = int(match.group(2), 16)
1712       if self.O_DIRECTORY & flag == self.O_DIRECTORY:
1713         # Ignore directories.
1714         return
1715       self._handle_file(
1716           pid,
1717           match.group(1),
1718           Results.File.READ if not ((self.O_RDWR | self.O_WRONLY) & flag)
1719             else Results.File.WRITE)
1720
1721     def handle_rename(self, pid, args):
1722       if pid not in self._process_lookup:
1723         # Another process tree, ignore.
1724         return
1725       match = self.RE_RENAME.match(args)
1726       if not match:
1727         raise TracingFailure(
1728             'Failed to parse arguments: %s' % args,
1729             None, None, None)
1730       self._handle_file(pid, match.group(1), Results.File.READ)
1731       self._handle_file(pid, match.group(2), Results.File.WRITE)
1732
1733     def _handle_file(self, pid, filepath, mode):
1734       if not filepath.startswith('/'):
1735         filepath = os.path.join(self._process_lookup[pid].cwd, filepath)
1736       # We can get '..' in the path.
1737       filepath = os.path.normpath(filepath)
1738       # Sadly, still need to filter out directories here;
1739       # saw open_nocancel(".", 0, 0) = 0 lines.
1740       if os.path.isdir(filepath):
1741         return
1742       self._process_lookup[pid].add_file(filepath, mode)
1743
1744     def handle_ftruncate(self, pid, args):
1745       """Just used as a signal to kill dtrace, ignoring."""
1746       pass
1747
1748     @staticmethod
1749     def _handle_ignored(pid, args):
1750       """Is called for all the event traces that are not handled."""
1751       raise NotImplementedError('Please implement me')
1752
1753     @staticmethod
1754     def process_escaped_arguments(text):
1755       """Extracts escaped arguments on a string and return the arguments as a
1756       list.
1757
1758       Implemented as an automaton.
1759
1760       Example:
1761         With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the
1762         function will return ['python2.7', '-c', 'print("hi")]
1763       """
1764       if not text.endswith('\\0'):
1765         raise ValueError('String is not null terminated: %r' % text, text)
1766       text = text[:-2]
1767
1768       def unescape(x):
1769         """Replaces '\\' with '\' and '\?' (where ? is anything) with ?."""
1770         out = []
1771         escaped = False
1772         for i in x:
1773           if i == '\\' and not escaped:
1774             escaped = True
1775             continue
1776           escaped = False
1777           out.append(i)
1778         return ''.join(out)
1779
1780       return [unescape(i) for i in text.split('\\001')]
1781
1782   class Tracer(ApiBase.Tracer):
1783     # pylint: disable=C0301
1784     #
1785     # To understand the following code, you'll want to take a look at:
1786     # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
1787     # https://wikis.oracle.com/display/DTrace/Variables
1788     # http://docs.oracle.com/cd/E19205-01/820-4221/
1789     #
1790     # 0. Dump all the valid probes into a text file. It is important, you
1791     #    want to redirect into a file and you don't want to constantly 'sudo'.
1792     # $ sudo dtrace -l > probes.txt
1793     #
1794     # 1. Count the number of probes:
1795     # $ wc -l probes.txt
1796     # 81823  # On OSX 10.7, including 1 header line.
1797     #
1798     # 2. List providers, intentionally skipping all the 'syspolicy10925' and the
1799     #    likes and skipping the header with NR>1:
1800     # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]'
1801     # dtrace
1802     # fbt
1803     # io
1804     # ip
1805     # lockstat
1806     # mach_trap
1807     # proc
1808     # profile
1809     # sched
1810     # syscall
1811     # tcp
1812     # vminfo
1813     #
1814     # 3. List of valid probes:
1815     # $ grep syscall probes.txt | less
1816     #    or use dtrace directly:
1817     # $ sudo dtrace -l -P syscall | less
1818     #
1819     # trackedpid is an associative array where its value can be 0, 1 or 2.
1820     # 0 is for untracked processes and is the default value for items not
1821     #   in the associative array.
1822     # 1 is for tracked processes.
1823     # 2 is for the script created by create_subprocess_thunk() only. It is not
1824     #   tracked itself but all its decendants are.
1825     #
1826     # The script will kill itself only once waiting_to_die == 1 and
1827     # current_processes == 0, so that both getlogin() was called and that
1828     # all traced processes exited.
1829     #
1830     # TODO(maruel): Use cacheable predicates. See
1831     # https://wikis.oracle.com/display/DTrace/Performance+Considerations
1832     D_CODE = """
1833       dtrace:::BEGIN {
1834         waiting_to_die = 0;
1835         current_processes = 0;
1836         logindex = 0;
1837         printf("%d %d %s_%s(\\"%s\\")\\n",
1838                logindex, PID, probeprov, probename, SCRIPT);
1839         logindex++;
1840       }
1841
1842       proc:::start /trackedpid[ppid]/ {
1843         trackedpid[pid] = 1;
1844         current_processes += 1;
1845         printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1846                logindex, pid, probeprov, probename,
1847                ppid,
1848                execname,
1849                current_processes);
1850         logindex++;
1851       }
1852       /* Should use SCRIPT but there is no access to this variable at that
1853        * point. */
1854       proc:::start /ppid == PID && execname == "Python"/ {
1855         trackedpid[pid] = 2;
1856         current_processes += 1;
1857         printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1858                logindex, pid, probeprov, probename,
1859                ppid,
1860                execname,
1861                current_processes);
1862         logindex++;
1863       }
1864       proc:::exit /trackedpid[pid] &&
1865                   current_processes == 1 &&
1866                   waiting_to_die == 1/ {
1867         trackedpid[pid] = 0;
1868         current_processes -= 1;
1869         printf("%d %d %s_%s(%d)\\n",
1870                logindex, pid, probeprov, probename,
1871                current_processes);
1872         logindex++;
1873         exit(0);
1874       }
1875       proc:::exit /trackedpid[pid]/ {
1876         trackedpid[pid] = 0;
1877         current_processes -= 1;
1878         printf("%d %d %s_%s(%d)\\n",
1879                logindex, pid, probeprov, probename,
1880                current_processes);
1881         logindex++;
1882       }
1883
1884       /* Use an arcane function to detect when we need to die */
1885       syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ {
1886         waiting_to_die = 1;
1887         printf("%d %d %s()\\n", logindex, pid, probefunc);
1888         logindex++;
1889       }
1890       syscall::ftruncate:entry /
1891           pid == PID && arg0 == FILE_ID && current_processes == 0/ {
1892         exit(0);
1893       }
1894
1895       syscall::open*:entry /trackedpid[pid] == 1/ {
1896         self->open_arg0 = arg0;
1897         self->open_arg1 = arg1;
1898         self->open_arg2 = arg2;
1899       }
1900       syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ {
1901         this->open_arg0 = copyinstr(self->open_arg0);
1902         printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n",
1903                logindex, pid, probefunc,
1904                this->open_arg0,
1905                self->open_arg1,
1906                self->open_arg2);
1907         logindex++;
1908         this->open_arg0 = 0;
1909       }
1910       syscall::open*:return /trackedpid[pid] == 1/ {
1911         self->open_arg0 = 0;
1912         self->open_arg1 = 0;
1913         self->open_arg2 = 0;
1914       }
1915
1916       syscall::rename:entry /trackedpid[pid] == 1/ {
1917         self->rename_arg0 = arg0;
1918         self->rename_arg1 = arg1;
1919       }
1920       syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ {
1921         this->rename_arg0 = copyinstr(self->rename_arg0);
1922         this->rename_arg1 = copyinstr(self->rename_arg1);
1923         printf("%d %d %s(\\"%s\\", \\"%s\\")\\n",
1924                logindex, pid, probefunc,
1925                this->rename_arg0,
1926                this->rename_arg1);
1927         logindex++;
1928         this->rename_arg0 = 0;
1929         this->rename_arg1 = 0;
1930       }
1931       syscall::rename:return /trackedpid[pid] == 1/ {
1932         self->rename_arg0 = 0;
1933         self->rename_arg1 = 0;
1934       }
1935
1936       /* Track chdir, it's painful because it is only receiving relative path.
1937        */
1938       syscall::chdir:entry /trackedpid[pid] == 1/ {
1939         self->chdir_arg0 = arg0;
1940       }
1941       syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ {
1942         this->chdir_arg0 = copyinstr(self->chdir_arg0);
1943         printf("%d %d %s(\\"%s\\")\\n",
1944                logindex, pid, probefunc,
1945                this->chdir_arg0);
1946         logindex++;
1947         this->chdir_arg0 = 0;
1948       }
1949       syscall::chdir:return /trackedpid[pid] == 1/ {
1950         self->chdir_arg0 = 0;
1951       }
1952       """
1953
1954     # execve-specific code, tends to throw a lot of exceptions.
1955     D_CODE_EXECVE = """
1956       /* Finally what we care about! */
1957       syscall::exec*:entry /trackedpid[pid]/ {
1958         self->exec_arg0 = copyinstr(arg0);
1959         /* Incrementally probe for a NULL in the argv parameter of execve() to
1960          * figure out argc. */
1961         /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was
1962          * found. */
1963         self->exec_argc = 0;
1964         /* Probe for argc==1 */
1965         this->exec_argv = (user_addr_t*)copyin(
1966              arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1967         self->exec_argc = this->exec_argv[self->exec_argc] ?
1968             (self->exec_argc + 1) : self->exec_argc;
1969
1970         /* Probe for argc==2 */
1971         this->exec_argv = (user_addr_t*)copyin(
1972              arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1973         self->exec_argc = this->exec_argv[self->exec_argc] ?
1974             (self->exec_argc + 1) : self->exec_argc;
1975
1976         /* Probe for argc==3 */
1977         this->exec_argv = (user_addr_t*)copyin(
1978              arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1979         self->exec_argc = this->exec_argv[self->exec_argc] ?
1980             (self->exec_argc + 1) : self->exec_argc;
1981
1982         /* Probe for argc==4 */
1983         this->exec_argv = (user_addr_t*)copyin(
1984              arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1985         self->exec_argc = this->exec_argv[self->exec_argc] ?
1986             (self->exec_argc + 1) : self->exec_argc;
1987
1988         /* Copy the inputs strings since there is no guarantee they'll be
1989          * present after the call completed. */
1990         self->exec_argv0 = (self->exec_argc > 0) ?
1991             copyinstr(this->exec_argv[0]) : "";
1992         self->exec_argv1 = (self->exec_argc > 1) ?
1993             copyinstr(this->exec_argv[1]) : "";
1994         self->exec_argv2 = (self->exec_argc > 2) ?
1995             copyinstr(this->exec_argv[2]) : "";
1996         self->exec_argv3 = (self->exec_argc > 3) ?
1997             copyinstr(this->exec_argv[3]) : "";
1998         this->exec_argv = 0;
1999       }
2000       syscall::exec*:return /trackedpid[pid] && errno == 0/ {
2001         /* We need to join strings here, as using multiple printf() would
2002          * cause tearing when multiple threads/processes are traced.
2003          * Since it is impossible to escape a string and join it to another one,
2004          * like sprintf("%s%S", previous, more), use hackery.
2005          * Each of the elements are split with a \\1. \\0 cannot be used because
2006          * it is simply ignored. This will conflict with any program putting a
2007          * \\1 in their execve() string but this should be "rare enough" */
2008         this->args = "";
2009         /* Process exec_argv[0] */
2010         this->args = strjoin(
2011             this->args, (self->exec_argc > 0) ? self->exec_argv0 : "");
2012
2013         /* Process exec_argv[1] */
2014         this->args = strjoin(
2015             this->args, (self->exec_argc > 1) ? "\\1" : "");
2016         this->args = strjoin(
2017             this->args, (self->exec_argc > 1) ? self->exec_argv1 : "");
2018
2019         /* Process exec_argv[2] */
2020         this->args = strjoin(
2021             this->args, (self->exec_argc > 2) ? "\\1" : "");
2022         this->args = strjoin(
2023             this->args, (self->exec_argc > 2) ? self->exec_argv2 : "");
2024
2025         /* Process exec_argv[3] */
2026         this->args = strjoin(
2027             this->args, (self->exec_argc > 3) ? "\\1" : "");
2028         this->args = strjoin(
2029             this->args, (self->exec_argc > 3) ? self->exec_argv3 : "");
2030
2031         /* Prints self->exec_argc to permits verifying the internal
2032          * consistency since this code is quite fishy. */
2033         printf("%d %d %s(\\"%s\\", [%d, %S])\\n",
2034                logindex, pid, probefunc,
2035                self->exec_arg0,
2036                self->exec_argc,
2037                this->args);
2038         logindex++;
2039         this->args = 0;
2040       }
2041       syscall::exec*:return /trackedpid[pid]/ {
2042         self->exec_arg0 = 0;
2043         self->exec_argc = 0;
2044         self->exec_argv0 = 0;
2045         self->exec_argv1 = 0;
2046         self->exec_argv2 = 0;
2047         self->exec_argv3 = 0;
2048       }
2049       """
2050
2051     # Code currently not used.
2052     D_EXTRANEOUS = """
2053       /* This is a good learning experience, since it traces a lot of things
2054        * related to the process and child processes.
2055        * Warning: it generates a gigantic log. For example, tracing
2056        * "data/trace_inputs/child1.py --child" generates a 2mb log and takes
2057        * several minutes to execute.
2058        */
2059       /*
2060       mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2061         printf("%d %d %s_%s() = %d\\n",
2062                logindex, pid, probeprov, probefunc, errno);
2063         logindex++;
2064       }
2065       proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2066         printf("%d %d %s_%s() = %d\\n",
2067                logindex, pid, probeprov, probefunc, errno);
2068         logindex++;
2069       }
2070       sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2071         printf("%d %d %s_%s() = %d\\n",
2072                logindex, pid, probeprov, probefunc, errno);
2073         logindex++;
2074       }
2075       syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2076         printf("%d %d %s_%s() = %d\\n",
2077                logindex, pid, probeprov, probefunc, errno);
2078         logindex++;
2079       }
2080       vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2081         printf("%d %d %s_%s() = %d\\n",
2082                logindex, pid, probeprov, probefunc, errno);
2083         logindex++;
2084       }
2085       */
2086       /* TODO(maruel): *stat* functions and friends
2087         syscall::access:return,
2088         syscall::chdir:return,
2089         syscall::chflags:return,
2090         syscall::chown:return,
2091         syscall::chroot:return,
2092         syscall::getattrlist:return,
2093         syscall::getxattr:return,
2094         syscall::lchown:return,
2095         syscall::lstat64:return,
2096         syscall::lstat:return,
2097         syscall::mkdir:return,
2098         syscall::pathconf:return,
2099         syscall::readlink:return,
2100         syscall::removexattr:return,
2101         syscall::setxattr:return,
2102         syscall::stat64:return,
2103         syscall::stat:return,
2104         syscall::truncate:return,
2105         syscall::unlink:return,
2106         syscall::utimes:return,
2107       */
2108       """
2109
2110     def __init__(self, logname, use_sudo):
2111       """Starts the log collection with dtrace.
2112
2113       Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so
2114       this needs to wait for dtrace to be "warmed up".
2115       """
2116       super(Dtrace.Tracer, self).__init__(logname)
2117       # This script is used as a signal to figure out the root process.
2118       self._signal_script = create_subprocess_thunk()
2119       self._scripts_to_cleanup.append(self._signal_script)
2120       # This unique dummy temp file is used to signal the dtrace script that it
2121       # should stop as soon as all the child processes are done. A bit hackish
2122       # but works fine enough.
2123       self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp(
2124           prefix='trace_signal_file')
2125
2126       dtrace_path = '/usr/sbin/dtrace'
2127       if not os.path.isfile(dtrace_path):
2128         dtrace_path = 'dtrace'
2129       elif use_sudo is None and (os.stat(dtrace_path).st_mode & stat.S_ISUID):
2130         # No need to sudo. For those following at home, don't do that.
2131         use_sudo = False
2132
2133       # Note: do not use the -p flag. It's useless if the initial process quits
2134       # too fast, resulting in missing traces from the grand-children. The D
2135       # code manages the dtrace lifetime itself.
2136       trace_cmd = [
2137         dtrace_path,
2138         # Use a larger buffer if getting 'out of scratch space' errors.
2139         # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables
2140         '-b', '10m',
2141         '-x', 'dynvarsize=10m',
2142         #'-x', 'dtrace_global_maxsize=1m',
2143         '-x', 'evaltime=exec',
2144         '-o', '/dev/stderr',
2145         '-q',
2146         '-n', self._get_dtrace_code(),
2147       ]
2148       if use_sudo is not False:
2149         trace_cmd.insert(0, 'sudo')
2150
2151       with open(self._logname + '.log', 'wb') as logfile:
2152         self._dtrace = subprocess.Popen(
2153             trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
2154       logging.debug('Started dtrace pid: %d' % self._dtrace.pid)
2155
2156       # Reads until one line is printed, which signifies dtrace is up and ready.
2157       with open(self._logname + '.log', 'rb') as logfile:
2158         while 'dtrace_BEGIN' not in logfile.readline():
2159           if self._dtrace.poll() is not None:
2160             # Do a busy wait. :/
2161             break
2162       logging.debug('dtrace started')
2163
2164     def _get_dtrace_code(self):
2165       """Setups the D code to implement child process tracking.
2166
2167       Injects the cookie in the script so it knows when to stop.
2168
2169       The script will detect any instance of the script created with
2170       create_subprocess_thunk() and will start tracing it.
2171       """
2172       out = (
2173           'inline int PID = %d;\n'
2174           'inline string SCRIPT = "%s";\n'
2175           'inline int FILE_ID = %d;\n'
2176           '\n'
2177           '%s') % (
2178               os.getpid(),
2179               self._signal_script,
2180               self._dummy_file_id,
2181               self.D_CODE)
2182       if os.environ.get('TRACE_INPUTS_DTRACE_ENABLE_EXECVE') == '1':
2183         # Do not enable by default since it tends to spew dtrace: error lines
2184         # because the execve() parameters are not in valid memory at the time of
2185         # logging.
2186         # TODO(maruel): Find a way to make this reliable since it's useful but
2187         # only works in limited/trivial uses cases for now.
2188         out += self.D_CODE_EXECVE
2189       return out
2190
2191     def trace(self, cmd, cwd, tracename, output):
2192       """Runs dtrace on an executable.
2193
2194       This dtruss is broken when it starts the process itself or when tracing
2195       child processes, this code starts a wrapper process
2196       generated with create_subprocess_thunk() which starts the executable to
2197       trace.
2198       """
2199       logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2200       assert os.path.isabs(cmd[0]), cmd[0]
2201       assert os.path.isabs(cwd), cwd
2202       assert os.path.normpath(cwd) == cwd, cwd
2203       with self._lock:
2204         if not self._initialized:
2205           raise TracingFailure(
2206               'Called Tracer.trace() on an unitialized object',
2207               None, None, None, tracename)
2208         assert tracename not in (i['trace'] for i in self._traces)
2209
2210       # Starts the script wrapper to start the child process. This signals the
2211       # dtrace script that this process is to be traced.
2212       stdout = stderr = None
2213       if output:
2214         stdout = subprocess.PIPE
2215         stderr = subprocess.STDOUT
2216       child_cmd = [
2217         sys.executable,
2218         self._signal_script,
2219         tracename,
2220       ]
2221       # Call a dummy function so that dtrace knows I'm about to launch a process
2222       # that needs to be traced.
2223       # Yummy.
2224       child = subprocess.Popen(
2225           child_cmd + tools.fix_python_path(cmd),
2226           stdin=subprocess.PIPE,
2227           stdout=stdout,
2228           stderr=stderr,
2229           cwd=cwd)
2230       logging.debug('Started child pid: %d' % child.pid)
2231
2232       out = child.communicate()[0]
2233       # This doesn't mean tracing is done, one of the grand-child process may
2234       # still be alive. It will be tracked with the dtrace script.
2235
2236       with self._lock:
2237         assert tracename not in (i['trace'] for i in self._traces)
2238         self._traces.append(
2239           {
2240             'cmd': cmd,
2241             'cwd': cwd,
2242             'pid': child.pid,
2243             'output': out,
2244             'trace': tracename,
2245           })
2246       return child.returncode, out
2247
2248     def close(self, timeout=None):
2249       """Terminates dtrace."""
2250       logging.debug('close(%s)' % timeout)
2251       try:
2252         try:
2253           super(Dtrace.Tracer, self).close(timeout)
2254           # Signal dtrace that it should stop now.
2255           # ftruncate doesn't exist on Windows.
2256           os.ftruncate(self._dummy_file_id, 0)  # pylint: disable=E1101
2257           if timeout:
2258             start = time.time()
2259             # Use polling. :/
2260             while (self._dtrace.poll() is None and
2261                    (time.time() - start) < timeout):
2262               time.sleep(0.1)
2263             self._dtrace.kill()
2264           self._dtrace.wait()
2265         finally:
2266           # Make sure to kill it in any case.
2267           if self._dtrace.poll() is None:
2268             try:
2269               self._dtrace.kill()
2270               self._dtrace.wait()
2271             except OSError:
2272               pass
2273
2274         if self._dtrace.returncode != 0:
2275           # Warn about any dtrace failure but basically ignore it.
2276           print 'dtrace failure: %s' % self._dtrace.returncode
2277       finally:
2278         os.close(self._dummy_file_id)
2279         os.remove(self._dummy_file_name)
2280
2281     def post_process_log(self):
2282       """Sorts the log back in order when each call occured.
2283
2284       dtrace doesn't save the buffer in strict order since it keeps one buffer
2285       per CPU.
2286       """
2287       super(Dtrace.Tracer, self).post_process_log()
2288       logname = self._logname + '.log'
2289       with open(logname, 'rb') as logfile:
2290         lines = [l for l in logfile if l.strip()]
2291       errors = [l for l in lines if l.startswith('dtrace:')]
2292       if errors:
2293         raise TracingFailure(
2294             'Found errors in the trace: %s' % '\n'.join(errors),
2295             None, None, None, logname)
2296       try:
2297         lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
2298       except ValueError:
2299         raise TracingFailure(
2300             'Found errors in the trace: %s' % '\n'.join(
2301                 l for l in lines if l.split(' ', 1)[0].isdigit()),
2302             None, None, None, logname)
2303       with open(logname, 'wb') as logfile:
2304         logfile.write(''.join(lines))
2305
2306   def __init__(self, use_sudo=None):
2307     super(Dtrace, self).__init__()
2308     self.use_sudo = use_sudo
2309
2310   def get_tracer(self, logname):
2311     return self.Tracer(logname, self.use_sudo)
2312
2313   @staticmethod
2314   def clean_trace(logname):
2315     for ext in ('', '.log'):
2316       if os.path.isfile(logname + ext):
2317         os.remove(logname + ext)
2318
2319   @classmethod
2320   def parse_log(cls, logname, blacklist, trace_name):
2321     logging.info('parse_log(%s, ..., %s)', logname, trace_name)
2322     assert os.path.isabs(logname)
2323
2324     def blacklist_more(filepath):
2325       # All the HFS metadata is in the form /.vol/...
2326       return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath)
2327
2328     data = tools.read_json(logname)
2329     out = []
2330     for item in data['traces']:
2331       if trace_name and item['trace'] != trace_name:
2332         continue
2333       result = {
2334         'output': item['output'],
2335         'trace': item['trace'],
2336       }
2337       try:
2338         context = cls.Context(blacklist_more, item['pid'], item['cwd'])
2339         # It's fine to assume the file as UTF-8: OSX enforces the file names to
2340         # be valid UTF-8 and we control the log output.
2341         for line in codecs.open(logname + '.log', 'rb', encoding='utf-8'):
2342           context.on_line(line)
2343         result['results'] = context.to_results()
2344       except TracingFailure:
2345         result['exception'] = sys.exc_info()
2346       out.append(result)
2347     return out
2348
2349
2350 class LogmanTrace(ApiBase):
2351   """Uses the native Windows ETW based tracing functionality to trace a child
2352   process.
2353
2354   Caveat: this implementations doesn't track cwd or initial_cwd. It is because
2355   the Windows Kernel doesn't have a concept of 'current working directory' at
2356   all. A Win32 process has a map of current directories, one per drive letter
2357   and it is managed by the user mode kernel32.dll. In kernel, a file is always
2358   opened relative to another file_object or as an absolute path. All the current
2359   working directory logic is done in user mode.
2360   """
2361   class Context(ApiBase.Context):
2362     """Processes a ETW log line and keeps the list of existent and non
2363     existent files accessed.
2364
2365     Ignores directories.
2366     """
2367     # These indexes are for the stripped version in json.
2368     EVENT_NAME = 0
2369     TYPE = 1
2370     PID = 2
2371     TID = 3
2372     PROCESSOR_ID = 4
2373     TIMESTAMP = 5
2374     USER_DATA = 6
2375
2376     class Process(ApiBase.Context.Process):
2377       def __init__(self, *args):
2378         super(LogmanTrace.Context.Process, self).__init__(*args)
2379         # Handle file objects that succeeded.
2380         self.file_objects = {}
2381
2382     def __init__(self, blacklist, thunk_pid, trace_name, thunk_cmd):
2383       logging.info(
2384           '%s(%d, %s, %s)', self.__class__.__name__, thunk_pid, trace_name,
2385           thunk_cmd)
2386       super(LogmanTrace.Context, self).__init__(blacklist)
2387       self._drive_map = file_path.DosDriveMap()
2388       # Threads mapping to the corresponding process id.
2389       self._threads_active = {}
2390       # Process ID of the tracer, e.g. the temporary script created by
2391       # create_subprocess_thunk(). This is tricky because the process id may
2392       # have been reused.
2393       self._thunk_pid = thunk_pid
2394       self._thunk_cmd = thunk_cmd
2395       self._trace_name = trace_name
2396       self._line_number = 0
2397       self._thunk_process = None
2398
2399     def on_line(self, line):
2400       """Processes a json Event line."""
2401       self._line_number += 1
2402       try:
2403         # By Opcode
2404         handler = getattr(
2405             self,
2406             'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),
2407             None)
2408         if not handler:
2409           raise TracingFailure(
2410               'Unexpected event %s_%s' % (
2411                   line[self.EVENT_NAME], line[self.TYPE]),
2412               None, None, None)
2413         handler(line)
2414       except TracingFailure, e:
2415         # Hack in the values since the handler could be a static function.
2416         e.pid = line[self.PID]
2417         e.line = line
2418         e.line_number = self._line_number
2419         # Re-raise the modified exception.
2420         raise
2421       except (KeyError, NotImplementedError, ValueError), e:
2422         raise TracingFailure(
2423             'Trace generated a %s exception: %s' % (
2424                 e.__class__.__name__, str(e)),
2425             line[self.PID],
2426             self._line_number,
2427             line,
2428             e)
2429
2430     def to_results(self):
2431       if not self.root_process:
2432         raise TracingFailure(
2433             'Failed to detect the initial process %d' % self._thunk_pid,
2434             None, None, None)
2435       process = self.root_process.to_results_process()
2436       return Results(process)
2437
2438     def _thread_to_process(self, tid):
2439       """Finds the process from the thread id."""
2440       tid = int(tid, 16)
2441       pid = self._threads_active.get(tid)
2442       if not pid or not self._process_lookup.get(pid):
2443         return
2444       return self._process_lookup[pid]
2445
2446     @classmethod
2447     def handle_EventTrace_Header(cls, line):
2448       """Verifies no event was dropped, e.g. no buffer overrun occured."""
2449       BUFFER_SIZE = cls.USER_DATA
2450       #VERSION = cls.USER_DATA + 1
2451       #PROVIDER_VERSION = cls.USER_DATA + 2
2452       #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3
2453       #END_TIME = cls.USER_DATA + 4
2454       #TIMER_RESOLUTION = cls.USER_DATA + 5
2455       #MAX_FILE_SIZE = cls.USER_DATA + 6
2456       #LOG_FILE_MODE = cls.USER_DATA + 7
2457       #BUFFERS_WRITTEN = cls.USER_DATA + 8
2458       #START_BUFFERS = cls.USER_DATA + 9
2459       #POINTER_SIZE = cls.USER_DATA + 10
2460       EVENTS_LOST = cls.USER_DATA + 11
2461       #CPU_SPEED = cls.USER_DATA + 12
2462       #LOGGER_NAME = cls.USER_DATA + 13
2463       #LOG_FILE_NAME = cls.USER_DATA + 14
2464       #BOOT_TIME = cls.USER_DATA + 15
2465       #PERF_FREQ = cls.USER_DATA + 16
2466       #START_TIME = cls.USER_DATA + 17
2467       #RESERVED_FLAGS = cls.USER_DATA + 18
2468       #BUFFERS_LOST = cls.USER_DATA + 19
2469       #SESSION_NAME_STRING = cls.USER_DATA + 20
2470       #LOG_FILE_NAME_STRING = cls.USER_DATA + 21
2471       if line[EVENTS_LOST] != '0':
2472         raise TracingFailure(
2473             ( '%s events were lost during trace, please increase the buffer '
2474               'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),
2475             None, None, None)
2476
2477     def handle_FileIo_Cleanup(self, line):
2478       """General wisdom: if a file is closed, it's because it was opened.
2479
2480       Note that FileIo_Close is not used since if a file was opened properly but
2481       not closed before the process exits, only Cleanup will be logged.
2482       """
2483       #IRP = self.USER_DATA
2484       TTID = self.USER_DATA + 1  # Thread ID, that's what we want.
2485       FILE_OBJECT = self.USER_DATA + 2
2486       #FILE_KEY = self.USER_DATA + 3
2487       proc = self._thread_to_process(line[TTID])
2488       if not proc:
2489         # Not a process we care about.
2490         return
2491       file_object = line[FILE_OBJECT]
2492       if file_object in proc.file_objects:
2493         filepath, access_type = proc.file_objects.pop(file_object)
2494         proc.add_file(filepath, access_type)
2495
2496     def handle_FileIo_Create(self, line):
2497       """Handles a file open.
2498
2499       All FileIo events are described at
2500       http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx
2501       for some value of 'description'.
2502
2503       " (..) process and thread id values of the IO events (..) are not valid "
2504       http://msdn.microsoft.com/magazine/ee358703.aspx
2505
2506       The FileIo.Create event doesn't return if the CreateFile() call
2507       succeeded, so keep track of the file_object and check that it is
2508       eventually closed with FileIo_Cleanup.
2509       """
2510       #IRP = self.USER_DATA
2511       TTID = self.USER_DATA + 1  # Thread ID, that's what we want.
2512       FILE_OBJECT = self.USER_DATA + 2
2513       #CREATE_OPTIONS = self.USER_DATA + 3
2514       #FILE_ATTRIBUTES = self.USER_DATA + 4
2515       #SHARE_ACCESS = self.USER_DATA + 5
2516       OPEN_PATH = self.USER_DATA + 6
2517
2518       proc = self._thread_to_process(line[TTID])
2519       if not proc:
2520         # Not a process we care about.
2521         return
2522
2523       raw_path = line[OPEN_PATH]
2524       # Ignore directories and bare drive right away.
2525       if raw_path.endswith(os.path.sep):
2526         return
2527       filepath = self._drive_map.to_win32(raw_path)
2528       # Ignore bare drive right away. Some may still fall through with format
2529       # like '\\?\X:'
2530       if len(filepath) == 2:
2531         return
2532       file_object = line[FILE_OBJECT]
2533       if os.path.isdir(filepath):
2534         # There is no O_DIRECTORY equivalent on Windows. The closed is
2535         # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So
2536         # simply discard directories are they are found.
2537         return
2538       # Override any stale file object.
2539       # TODO(maruel): Figure out a way to detect if the file was opened for
2540       # reading or writting. Sadly CREATE_OPTIONS doesn't seem to be of any help
2541       # here. For now mark as None to make it clear we have no idea what it is
2542       # about.
2543       proc.file_objects[file_object] = (filepath, None)
2544
2545     def handle_FileIo_Rename(self, line):
2546       # TODO(maruel): Handle?
2547       pass
2548
2549     def handle_Process_End(self, line):
2550       pid = line[self.PID]
2551       if self._process_lookup.get(pid):
2552         logging.info('Terminated: %d' % pid)
2553         self._process_lookup[pid] = None
2554       else:
2555         logging.debug('Terminated: %d' % pid)
2556       if self._thunk_process and self._thunk_process.pid == pid:
2557         self._thunk_process = None
2558
2559     def handle_Process_Start(self, line):
2560       """Handles a new child process started by PID."""
2561       #UNIQUE_PROCESS_KEY = self.USER_DATA
2562       PROCESS_ID = self.USER_DATA + 1
2563       #PARENT_PID = self.USER_DATA + 2
2564       #SESSION_ID = self.USER_DATA + 3
2565       #EXIT_STATUS = self.USER_DATA + 4
2566       #DIRECTORY_TABLE_BASE = self.USER_DATA + 5
2567       #USER_SID = self.USER_DATA + 6
2568       IMAGE_FILE_NAME = self.USER_DATA + 7
2569       COMMAND_LINE = self.USER_DATA + 8
2570
2571       ppid = line[self.PID]
2572       pid = int(line[PROCESS_ID], 16)
2573       command_line = CommandLineToArgvW(line[COMMAND_LINE])
2574       logging.debug(
2575           'New process %d->%d (%s) %s' %
2576             (ppid, pid, line[IMAGE_FILE_NAME], command_line))
2577
2578       if pid == self._thunk_pid:
2579         # Need to ignore processes we don't know about because the log is
2580         # system-wide. self._thunk_pid shall start only one process.
2581         # This is tricky though because Windows *loves* to reuse process id and
2582         # it happens often that the process ID of the thunk script created by
2583         # create_subprocess_thunk() is reused. So just detecting the pid here is
2584         # not sufficient, we must confirm the command line.
2585         if command_line[:len(self._thunk_cmd)] != self._thunk_cmd:
2586           logging.info(
2587               'Ignoring duplicate pid %d for %s: %s while searching for %s',
2588               pid, self._trace_name, command_line, self._thunk_cmd)
2589           return
2590
2591         # TODO(maruel): The check is quite weak. Add the thunk path.
2592         if self._thunk_process:
2593           raise TracingFailure(
2594               ( 'Parent process is _thunk_pid(%d) but thunk_process(%d) is '
2595                 'already set') % (self._thunk_pid, self._thunk_process.pid),
2596               None, None, None)
2597         proc = self.Process(self.blacklist, pid, None)
2598         self._thunk_process = proc
2599         return
2600       elif ppid == self._thunk_pid and self._thunk_process:
2601         proc = self.Process(self.blacklist, pid, None)
2602         self.root_process = proc
2603         ppid = None
2604       elif self._process_lookup.get(ppid):
2605         proc = self.Process(self.blacklist, pid, None)
2606         self._process_lookup[ppid].children.append(proc)
2607       else:
2608         # Ignore
2609         return
2610       self._process_lookup[pid] = proc
2611
2612       proc.command = command_line
2613       proc.executable = line[IMAGE_FILE_NAME]
2614       # proc.command[0] may be the absolute path of 'executable' but it may be
2615       # anything else too. If it happens that command[0] ends with executable,
2616       # use it, otherwise defaults to the base name.
2617       cmd0 = proc.command[0].lower()
2618       if not cmd0.endswith('.exe'):
2619         # TODO(maruel): That's not strictly true either.
2620         cmd0 += '.exe'
2621       if cmd0.endswith(proc.executable) and os.path.isfile(cmd0):
2622         # Fix the path.
2623         cmd0 = cmd0.replace('/', os.path.sep)
2624         cmd0 = os.path.normpath(cmd0)
2625         proc.executable = file_path.get_native_path_case(cmd0)
2626       logging.info(
2627           'New child: %s -> %d %s' % (ppid, pid, proc.executable))
2628
2629     def handle_Thread_End(self, line):
2630       """Has the same parameters as Thread_Start."""
2631       tid = int(line[self.TID], 16)
2632       self._threads_active.pop(tid, None)
2633
2634     def handle_Thread_Start(self, line):
2635       """Handles a new thread created.
2636
2637       Do not use self.PID here since a process' initial thread is created by
2638       the parent process.
2639       """
2640       PROCESS_ID = self.USER_DATA
2641       TTHREAD_ID = self.USER_DATA + 1
2642       #STACK_BASE = self.USER_DATA + 2
2643       #STACK_LIMIT = self.USER_DATA + 3
2644       #USER_STACK_BASE = self.USER_DATA + 4
2645       #USER_STACK_LIMIT = self.USER_DATA + 5
2646       #AFFINITY = self.USER_DATA + 6
2647       #WIN32_START_ADDR = self.USER_DATA + 7
2648       #TEB_BASE = self.USER_DATA + 8
2649       #SUB_PROCESS_TAG = self.USER_DATA + 9
2650       #BASE_PRIORITY = self.USER_DATA + 10
2651       #PAGE_PRIORITY = self.USER_DATA + 11
2652       #IO_PRIORITY = self.USER_DATA + 12
2653       #THREAD_FLAGS = self.USER_DATA + 13
2654       # Do not use self.PID here since a process' initial thread is created by
2655       # the parent process.
2656       pid = int(line[PROCESS_ID], 16)
2657       tid = int(line[TTHREAD_ID], 16)
2658       logging.debug('New thread pid:%d, tid:%d' % (pid, tid))
2659       self._threads_active[tid] = pid
2660
2661     @classmethod
2662     def supported_events(cls):
2663       """Returns all the procesed events."""
2664       out = []
2665       for member in dir(cls):
2666         match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
2667         if match:
2668           out.append(match.groups())
2669       return out
2670
2671   class Tracer(ApiBase.Tracer):
2672     # The basic headers.
2673     EXPECTED_HEADER = [
2674       u'Event Name',
2675       u'Type',
2676       u'Event ID',
2677       u'Version',
2678       u'Channel',
2679       u'Level',  # 5
2680       u'Opcode',
2681       u'Task',
2682       u'Keyword',
2683       u'PID',
2684       u'TID',  # 10
2685       u'Processor Number',
2686       u'Instance ID',
2687       u'Parent Instance ID',
2688       u'Activity ID',
2689       u'Related Activity ID',  # 15
2690       u'Clock-Time',
2691       u'Kernel(ms)',  # Both have a resolution of ~15ms which makes them
2692       u'User(ms)',    # pretty much useless.
2693       u'User Data',   # Extra arguments that are event-specific.
2694     ]
2695     # Only the useful headers common to all entries are listed there. Any column
2696     # at 19 or higher is dependent on the specific event.
2697     EVENT_NAME = 0
2698     TYPE = 1
2699     PID = 9
2700     TID = 10
2701     PROCESSOR_ID = 11
2702     TIMESTAMP = 16
2703     NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
2704     USER_DATA = 19
2705
2706     class CsvReader(object):
2707       """CSV reader that reads files generated by tracerpt.exe.
2708
2709       csv.reader() fails to read them properly, it mangles file names quoted
2710       with "" with a comma in it.
2711       """
2712         # 0. Had a ',' or one of the following ' ' after a comma, next should
2713         # be ' ', '"' or string or ',' for an empty field.
2714       ( HAD_DELIMITER,
2715         # 1. Processing an unquoted field up to ','.
2716         IN_STR,
2717         # 2. Processing a new field starting with '"'.
2718         STARTING_STR_QUOTED,
2719         # 3. Second quote in a row at the start of a field. It could be either
2720         # '""foo""' or '""'. Who the hell thought it was a great idea to use
2721         # the same character for delimiting and escaping?
2722         STARTING_SECOND_QUOTE,
2723         # 4. A quote inside a quoted string where the previous character was
2724         # not a quote, so the string is not empty. Can be either: end of a
2725         # quoted string (a delimiter) or a quote escape. The next char must be
2726         # either '"' or ','.
2727         HAD_QUOTE_IN_QUOTED,
2728         # 5. Second quote inside a quoted string.
2729         HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED,
2730         # 6. Processing a field that started with '"'.
2731         IN_STR_QUOTED) = range(7)
2732
2733       def __init__(self, f):
2734         self.f = f
2735
2736       def __iter__(self):
2737         return self
2738
2739       def next(self):
2740         """Splits the line in fields."""
2741         line = self.f.readline()
2742         if not line:
2743           raise StopIteration()
2744         line = line.strip()
2745         fields = []
2746         state = self.HAD_DELIMITER
2747         for i, c in enumerate(line):
2748           if state == self.HAD_DELIMITER:
2749             if c == ',':
2750               # Empty field.
2751               fields.append('')
2752             elif c == ' ':
2753               # Ignore initial whitespaces
2754               pass
2755             elif c == '"':
2756               state = self.STARTING_STR_QUOTED
2757               fields.append('')
2758             else:
2759               # Start of a new field.
2760               state = self.IN_STR
2761               fields.append(c)
2762
2763           elif state == self.IN_STR:
2764             # Do not accept quote inside unquoted field.
2765             assert c != '"', (i, c, line, fields)
2766             if c == ',':
2767               fields[-1] = fields[-1].strip()
2768               state = self.HAD_DELIMITER
2769             else:
2770               fields[-1] = fields[-1] + c
2771
2772           elif state == self.STARTING_STR_QUOTED:
2773             if c == '"':
2774               # Do not store the character yet.
2775               state = self.STARTING_SECOND_QUOTE
2776             else:
2777               state = self.IN_STR_QUOTED
2778               fields[-1] = fields[-1] + c
2779
2780           elif state == self.STARTING_SECOND_QUOTE:
2781             if c == ',':
2782               # It was an empty field. '""' == ''.
2783               state = self.HAD_DELIMITER
2784             else:
2785               fields[-1] = fields[-1] + '"' + c
2786               state = self.IN_STR_QUOTED
2787
2788           elif state == self.HAD_QUOTE_IN_QUOTED:
2789             if c == ',':
2790               # End of the string.
2791               state = self.HAD_DELIMITER
2792             elif c == '"':
2793               state = self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED
2794             else:
2795               # The previous double-quote was just an unescaped quote.
2796               fields[-1] = fields[-1] + '"' + c
2797               state = self.IN_STR_QUOTED
2798
2799           elif state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED:
2800             if c == ',':
2801               # End of the string.
2802               state = self.HAD_DELIMITER
2803               fields[-1] = fields[-1] + '"'
2804             else:
2805               # That's just how the logger rolls. Revert back to appending the
2806               # char and "guess" it was a quote in a double-quoted string.
2807               state = self.IN_STR_QUOTED
2808               fields[-1] = fields[-1] + '"' + c
2809
2810           elif state == self.IN_STR_QUOTED:
2811             if c == '"':
2812               # Could be a delimiter or an escape.
2813               state = self.HAD_QUOTE_IN_QUOTED
2814             else:
2815               fields[-1] = fields[-1] + c
2816
2817         if state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED:
2818           fields[-1] = fields[-1] + '"'
2819         else:
2820           assert state in (
2821               # Terminated with a normal field.
2822               self.IN_STR,
2823               # Terminated with an empty field.
2824               self.STARTING_SECOND_QUOTE,
2825               # Terminated with a normal quoted field.
2826               self.HAD_QUOTE_IN_QUOTED), (
2827               line, state, fields)
2828         return fields
2829
2830     def __init__(self, logname):
2831       """Starts the log collection.
2832
2833       Requires administrative access. logman.exe is synchronous so no need for a
2834       "warmup" call.  'Windows Kernel Trace' is *localized* so use its GUID
2835       instead.  The GUID constant name is SystemTraceControlGuid. Lovely.
2836
2837       One can get the list of potentially interesting providers with:
2838       "logman query providers | findstr /i file"
2839       """
2840       super(LogmanTrace.Tracer, self).__init__(logname)
2841       self._signal_script = create_subprocess_thunk()
2842       self._scripts_to_cleanup.append(self._signal_script)
2843       cmd_start = [
2844         'logman.exe',
2845         'start',
2846         'NT Kernel Logger',
2847         '-p', '{9e814aad-3204-11d2-9a82-006008a86939}',
2848         # splitio,fileiocompletion,syscall,file,cswitch,img
2849         '(process,fileio,thread)',
2850         '-o', self._logname + '.etl',
2851         '-ets',  # Send directly to kernel
2852         # Values extracted out of thin air.
2853         # Event Trace Session buffer size in kb.
2854         '-bs', '10240',
2855         # Number of Event Trace Session buffers.
2856         '-nb', '16', '256',
2857       ]
2858       logging.debug('Running: %s' % cmd_start)
2859       try:
2860         subprocess.check_call(
2861             cmd_start,
2862             stdin=subprocess.PIPE,
2863             stdout=subprocess.PIPE,
2864             stderr=subprocess.STDOUT)
2865       except subprocess.CalledProcessError, e:
2866         if e.returncode == -2147024891:
2867           print >> sys.stderr, 'Please restart with an elevated admin prompt'
2868         elif e.returncode == -2144337737:
2869           print >> sys.stderr, (
2870               'A kernel trace was already running, stop it and try again')
2871         raise
2872
2873     def trace(self, cmd, cwd, tracename, output):
2874       logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2875       assert os.path.isabs(cmd[0]), cmd[0]
2876       assert os.path.isabs(cwd), cwd
2877       assert os.path.normpath(cwd) == cwd, cwd
2878       with self._lock:
2879         if not self._initialized:
2880           raise TracingFailure(
2881               'Called Tracer.trace() on an unitialized object',
2882               None, None, None, tracename)
2883         assert tracename not in (i['trace'] for i in self._traces)
2884
2885       # Use "logman -?" for help.
2886
2887       stdout = stderr = None
2888       if output:
2889         stdout = subprocess.PIPE
2890         stderr = subprocess.STDOUT
2891
2892       # Run the child process.
2893       logging.debug('Running: %s' % cmd)
2894       # Use the temporary script generated with create_subprocess_thunk() so we
2895       # have a clear pid owner. Since trace_inputs.py can be used as a library
2896       # and could trace multiple processes simultaneously, it makes it more
2897       # complex if the executable to be traced is executed directly here. It
2898       # also solves issues related to logman.exe that needs to be executed to
2899       # control the kernel trace.
2900       child_cmd = [
2901         sys.executable,
2902         self._signal_script,
2903         tracename,
2904       ]
2905       child = subprocess.Popen(
2906           child_cmd + tools.fix_python_path(cmd),
2907           cwd=cwd,
2908           stdin=subprocess.PIPE,
2909           stdout=stdout,
2910           stderr=stderr)
2911       logging.debug('Started child pid: %d' % child.pid)
2912       out = child.communicate()[0]
2913       # This doesn't mean all the grand-children are done. Sadly, we don't have
2914       # a good way to determine that.
2915
2916       with self._lock:
2917         assert tracename not in (i['trace'] for i in self._traces)
2918         self._traces.append({
2919           'cmd': cmd,
2920           'cwd': cwd,
2921           'output': out,
2922           'pid': child.pid,
2923           # Used to figure out the real process when process ids are reused.
2924           'thunk_cmd': child_cmd,
2925           'trace': tracename,
2926         })
2927
2928       return child.returncode, out
2929
2930     def close(self, _timeout=None):
2931       """Stops the kernel log collection and converts the traces to text
2932       representation.
2933       """
2934       with self._lock:
2935         try:
2936           super(LogmanTrace.Tracer, self).close()
2937         finally:
2938           cmd_stop = [
2939             'logman.exe',
2940             'stop',
2941             'NT Kernel Logger',
2942             '-ets',  # Sends the command directly to the kernel.
2943           ]
2944           logging.debug('Running: %s' % cmd_stop)
2945           subprocess.check_call(
2946               cmd_stop,
2947               stdin=subprocess.PIPE,
2948               stdout=subprocess.PIPE,
2949               stderr=subprocess.STDOUT)
2950
2951     def post_process_log(self):
2952       """Converts the .etl file into .csv then into .json."""
2953       super(LogmanTrace.Tracer, self).post_process_log()
2954       logformat = 'csv'
2955       self._convert_log(logformat)
2956       self._trim_log(logformat)
2957
2958     def _gen_logdata(self):
2959       return  {
2960         'format': 'csv',
2961         'traces': self._traces,
2962       }
2963
2964     def _trim_log(self, logformat):
2965       """Reduces the amount of data in original log by generating a 'reduced'
2966       log.
2967       """
2968       if logformat == 'csv_utf16':
2969         file_handle = codecs.open(
2970             self._logname + '.' + logformat, 'r', encoding='utf-16')
2971
2972       elif logformat == 'csv':
2973         assert sys.getfilesystemencoding() == 'mbcs'
2974         file_handle = codecs.open(
2975               self._logname + '.' + logformat, 'r',
2976               encoding=get_current_encoding())
2977
2978       supported_events = LogmanTrace.Context.supported_events()
2979
2980       def trim(generator):
2981         """Loads items from the generator and returns the interesting data.
2982
2983         It filters out any uninteresting line and reduce the amount of data in
2984         the trace.
2985         """
2986         for index, line in enumerate(generator):
2987           if not index:
2988             if line != self.EXPECTED_HEADER:
2989               raise TracingFailure(
2990                   'Found malformed header: %s' % line,
2991                   None, None, None)
2992             continue
2993           # As you can see, the CSV is full of useful non-redundant information:
2994           if (line[2] != '0' or  # Event ID
2995               line[3] not in ('2', '3') or  # Version
2996               line[4] != '0' or  # Channel
2997               line[5] != '0' or  # Level
2998               line[7] != '0' or  # Task
2999               line[8] != '0x0000000000000000' or  # Keyword
3000               line[12] != '' or  # Instance ID
3001               line[13] != '' or  # Parent Instance ID
3002               line[14] != self.NULL_GUID or  # Activity ID
3003               line[15] != ''):  # Related Activity ID
3004             raise TracingFailure(
3005                 'Found unexpected values in line: %s' % ' '.join(line),
3006                   None, None, None)
3007
3008           if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
3009             continue
3010
3011           yield [
3012               line[self.EVENT_NAME],
3013               line[self.TYPE],
3014               line[self.PID],
3015               line[self.TID],
3016               line[self.PROCESSOR_ID],
3017               line[self.TIMESTAMP],
3018           ] + line[self.USER_DATA:]
3019
3020       # must not convert the trim() call into a list, since it will use too much
3021       # memory for large trace. use a csv file as a workaround since the json
3022       # parser requires a complete in-memory file.
3023       with open('%s.preprocessed' % self._logname, 'wb') as f:
3024         # $ and * can't be used in file name on windows, reducing the likelihood
3025         # of having to escape a string.
3026         out = csv.writer(
3027             f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL)
3028         for line in trim(self.CsvReader(file_handle)):
3029           out.writerow([s.encode('utf-8') for s in line])
3030
3031     def _convert_log(self, logformat):
3032       """Converts the ETL trace to text representation.
3033
3034       Normally, 'csv' is sufficient. If complex scripts are used (like eastern
3035       languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
3036
3037       Arguments:
3038         - logformat: Text format to be generated, csv, csv_utf16 or xml.
3039
3040       Use "tracerpt -?" for help.
3041       """
3042       LOCALE_INVARIANT = 0x7F
3043       windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
3044       cmd_convert = [
3045         'tracerpt.exe',
3046         '-l', self._logname + '.etl',
3047         '-o', self._logname + '.' + logformat,
3048         '-gmt',  # Use UTC
3049         '-y',  # No prompt
3050         # Use -of XML to get the header of each items after column 19, e.g. all
3051         # the actual headers of 'User Data'.
3052       ]
3053
3054       if logformat == 'csv':
3055         # tracerpt localizes the 'Type' column, for major brainfuck
3056         # entertainment. I can't imagine any sane reason to do that.
3057         cmd_convert.extend(['-of', 'CSV'])
3058       elif logformat == 'csv_utf16':
3059         # This causes it to use UTF-16, which doubles the log size but ensures
3060         # the log is readable for non-ASCII characters.
3061         cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
3062       elif logformat == 'xml':
3063         cmd_convert.extend(['-of', 'XML'])
3064       else:
3065         raise ValueError('Unexpected log format \'%s\'' % logformat)
3066       logging.debug('Running: %s' % cmd_convert)
3067       # This can takes tens of minutes for large logs.
3068       # Redirects all output to stderr.
3069       subprocess.check_call(
3070           cmd_convert,
3071           stdin=subprocess.PIPE,
3072           stdout=sys.stderr,
3073           stderr=sys.stderr)
3074
3075   def __init__(self, use_sudo=False):  # pylint: disable=W0613
3076     super(LogmanTrace, self).__init__()
3077     # Ignore use_sudo. It's irrelevant on Windows but kept to simplify the API.
3078
3079   @staticmethod
3080   def clean_trace(logname):
3081     for ext in ('', '.csv', '.etl', '.json', '.xml', '.preprocessed'):
3082       if os.path.isfile(logname + ext):
3083         os.remove(logname + ext)
3084
3085   @classmethod
3086   def parse_log(cls, logname, blacklist, trace_name):
3087     logging.info('parse_log(%s, ..., %s)', logname, trace_name)
3088     assert os.path.isabs(logname)
3089
3090     def blacklist_more(filepath):
3091       # All the NTFS metadata is in the form x:\$EXTEND or stuff like that.
3092       return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)
3093
3094     # Create a list of (Context, result_dict) tuples. This is necessary because
3095     # the csv file may be larger than the amount of available memory.
3096     contexes = [
3097       (
3098         cls.Context(
3099             blacklist_more, item['pid'], item['trace'], item['thunk_cmd']),
3100         {
3101           'output': item['output'],
3102           'trace': item['trace'],
3103         },
3104       )
3105       for item in tools.read_json(logname)['traces']
3106       if not trace_name or item['trace'] == trace_name
3107     ]
3108
3109     # The log may be too large to fit in memory and it is not efficient to read
3110     # it multiple times, so multiplex the contexes instead, which is slightly
3111     # more awkward.
3112     with open('%s.preprocessed' % logname, 'rb') as f:
3113       lines = csv.reader(
3114           f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL)
3115       for encoded in lines:
3116         line = [s.decode('utf-8') for s in encoded]
3117         # Convert the PID in-place from hex.
3118         line[cls.Context.PID] = int(line[cls.Context.PID], 16)
3119         for context in contexes:
3120           if 'exception' in context[1]:
3121             continue
3122           try:
3123             context[0].on_line(line)
3124           except TracingFailure:
3125             context[1]['exception'] = sys.exc_info()
3126
3127     for context in contexes:
3128       if 'exception' in context[1]:
3129         continue
3130       context[1]['results'] = context[0].to_results()
3131
3132     return [context[1] for context in contexes]
3133
3134
3135 def get_api(**kwargs):
3136   """Returns the correct implementation for the current OS."""
3137   if sys.platform == 'cygwin':
3138     raise NotImplementedError(
3139         'Not implemented for cygwin, start the script from Win32 python')
3140   flavors = {
3141     'win32': LogmanTrace,
3142     'darwin': Dtrace,
3143     'sunos5': Dtrace,
3144     'freebsd7': Dtrace,
3145     'freebsd8': Dtrace,
3146   }
3147   # Defaults to strace.
3148   return flavors.get(sys.platform, Strace)(**kwargs)
3149
3150
3151 def extract_directories(root_dir, files, blacklist):
3152   """Detects if all the files in a directory are in |files| and if so, replace
3153   the individual files by a Results.Directory instance.
3154
3155   Takes a list of Results.File instances and returns a shorter list of
3156   Results.File and Results.Directory instances.
3157
3158   Arguments:
3159     - root_dir: Optional base directory that shouldn't be search further.
3160     - files: list of Results.File instances.
3161     - blacklist: lambda to reject unneeded files, for example r'.+\.pyc'.
3162   """
3163   logging.info(
3164       'extract_directories(%s, %d files, ...)' % (root_dir, len(files)))
3165   assert not (root_dir or '').endswith(os.path.sep), root_dir
3166   # It is important for root_dir to not be a symlinked path, make sure to call
3167   # os.path.realpath() as needed.
3168   assert not root_dir or (
3169       os.path.realpath(file_path.get_native_path_case(root_dir)) == root_dir)
3170   assert not any(isinstance(f, Results.Directory) for f in files)
3171   # Remove non existent files.
3172   files = [f for f in files if f.existent]
3173   if not files:
3174     return files
3175   # All files must share the same root, which can be None.
3176   assert len(set(f.root for f in files)) == 1, set(f.root for f in files)
3177
3178   # Creates a {directory: {filename: File}} mapping, up to root.
3179   buckets = {}
3180   if root_dir:
3181     buckets[root_dir] = {}
3182   for fileobj in files:
3183     path = fileobj.full_path
3184     directory = os.path.dirname(path)
3185     assert directory
3186     # Do not use os.path.basename() so trailing os.path.sep is kept.
3187     basename = path[len(directory)+1:]
3188     files_in_directory = buckets.setdefault(directory, {})
3189     files_in_directory[basename] = fileobj
3190     # Add all the directories recursively up to root.
3191     while True:
3192       old_d = directory
3193       directory = os.path.dirname(directory)
3194       if directory + os.path.sep == root_dir or directory == old_d:
3195         break
3196       buckets.setdefault(directory, {})
3197
3198   root_prefix = len(root_dir) + 1 if root_dir else 0
3199   for directory in sorted(buckets, reverse=True):
3200     if not os.path.isdir(directory):
3201       logging.debug(
3202           '%s was a directory but doesn\'t exist anymore; ignoring', directory)
3203       continue
3204     actual = set(f for f in os.listdir(directory) if not blacklist(f))
3205     expected = set(buckets[directory])
3206     if not (actual - expected):
3207       parent = os.path.dirname(directory)
3208       buckets[parent][os.path.basename(directory)] = Results.Directory(
3209         root_dir,
3210         directory[root_prefix:],
3211         False,
3212         sum(f.size for f in buckets[directory].itervalues()),
3213         sum(f.nb_files for f in buckets[directory].itervalues()))
3214       # Remove the whole bucket.
3215       del buckets[directory]
3216
3217   # Reverse the mapping with what remains. The original instances are returned,
3218   # so the cached meta data is kept.
3219   files = sum((x.values() for x in buckets.itervalues()), [])
3220   return sorted(files, key=lambda x: x.path)
3221
3222
3223 def trace(logfile, cmd, cwd, api, output):
3224   """Traces an executable. Returns (returncode, output) from api.
3225
3226   Arguments:
3227   - logfile: file to write to.
3228   - cmd: command to run.
3229   - cwd: current directory to start the process in.
3230   - api: a tracing api instance.
3231   - output: if True, returns output, otherwise prints it at the console.
3232   """
3233   cmd = tools.fix_python_path(cmd)
3234   api.clean_trace(logfile)
3235   with api.get_tracer(logfile) as tracer:
3236     return tracer.trace(cmd, cwd, 'default', output)
3237
3238
3239 def CMDclean(parser, args):
3240   """Cleans up traces."""
3241   options, args = parser.parse_args(args)
3242   api = get_api()
3243   api.clean_trace(options.log)
3244   return 0
3245
3246
3247 def CMDtrace(parser, args):
3248   """Traces an executable."""
3249   parser.allow_interspersed_args = False
3250   parser.add_option(
3251       '-q', '--quiet', action='store_true',
3252       help='Redirects traced executable output to /dev/null')
3253   parser.add_option(
3254       '-s', '--sudo', action='store_true',
3255       help='Use sudo when shelling out the tracer tool (ignored on Windows)')
3256   parser.add_option(
3257       '-n', '--no-sudo', action='store_false',
3258       help='Don\'t use sudo')
3259   options, args = parser.parse_args(args)
3260
3261   if not args:
3262     parser.error('Please provide a command to run')
3263
3264   if not can_trace():
3265     parser.error('Please rerun this program with admin privileges')
3266
3267   if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):
3268     args[0] = os.path.abspath(args[0])
3269
3270   # options.sudo default value is None, which is to do whatever tracer defaults
3271   # do.
3272   api = get_api(use_sudo=options.sudo)
3273   return trace(options.log, args, os.getcwd(), api, options.quiet)[0]
3274
3275
3276 def CMDread(parser, args):
3277   """Reads the logs and prints the result."""
3278   parser.add_option(
3279       '-V', '--variable',
3280       nargs=2,
3281       action='append',
3282       dest='variables',
3283       metavar='VAR_NAME directory',
3284       default=[],
3285       help=('Variables to replace relative directories against. Example: '
3286             '"-v \'$HOME\' \'/home/%s\'" will replace all occurence of your '
3287             'home dir with $HOME') % getpass.getuser())
3288   parser.add_option(
3289       '--root-dir',
3290       help='Root directory to base everything off it. Anything outside of this '
3291            'this directory will not be reported')
3292   parser.add_option(
3293       '--trace-name',
3294       help='Only reads one of the trace. Defaults to reading all traces')
3295   parser.add_option(
3296       '-j', '--json', action='store_true',
3297       help='Outputs raw result data as json')
3298   parser.add_option(
3299       '--trace-blacklist', action='append', default=[],
3300       help='List of regexp to use as blacklist filter')
3301   options, args = parser.parse_args(args)
3302
3303   if options.root_dir:
3304     options.root_dir = file_path.get_native_path_case(
3305         unicode(os.path.abspath(options.root_dir)))
3306
3307   variables = dict(options.variables)
3308   api = get_api()
3309   blacklist = tools.gen_blacklist(options.trace_blacklist)
3310   data = api.parse_log(options.log, blacklist, options.trace_name)
3311   # Process each trace.
3312   output_as_json = []
3313   try:
3314     for item in data:
3315       if 'exception' in item:
3316         # Do not abort the other traces.
3317         print >> sys.stderr, (
3318             'Trace %s: Got an exception: %s' % (
3319               item['trace'], item['exception'][1]))
3320         continue
3321       results = item['results']
3322       if options.root_dir:
3323         results = results.strip_root(options.root_dir)
3324
3325       if options.json:
3326         output_as_json.append(results.flatten())
3327       else:
3328         simplified = extract_directories(
3329             options.root_dir, results.files, blacklist)
3330         simplified = [f.replace_variables(variables) for f in simplified]
3331         if len(data) > 1:
3332           print('Trace: %s' % item['trace'])
3333         print('Total: %d' % len(results.files))
3334         print('Non existent: %d' % len(results.non_existent))
3335         for f in results.non_existent:
3336           print('  %s' % f.path)
3337         print(
3338             'Interesting: %d reduced to %d' % (
3339                 len(results.existent), len(simplified)))
3340         for f in simplified:
3341           print('  %s' % f.path)
3342
3343     if options.json:
3344       tools.write_json(sys.stdout, output_as_json, False)
3345   except KeyboardInterrupt:
3346     return 1
3347   except IOError as e:
3348     if e.errno == errno.EPIPE:
3349       # Do not print a stack trace when the output is piped to less and the user
3350       # quits before the whole output was written.
3351       return 1
3352     raise
3353   return 0
3354
3355
3356 class OptionParserTraceInputs(tools.OptionParserWithLogging):
3357   """Adds automatic --log handling."""
3358
3359   # Disable --log-file options since both --log and --log-file options are
3360   # confusing.
3361   # TODO(vadimsh): Rename --log-file or --log to something else.
3362   enable_log_file = False
3363
3364   def __init__(self, **kwargs):
3365     tools.OptionParserWithLogging.__init__(self, **kwargs)
3366     self.add_option(
3367         '-l', '--log', help='Log file to generate or read, required')
3368
3369   def parse_args(self, *args, **kwargs):
3370     """Makes sure the paths make sense.
3371
3372     On Windows, / and \ are often mixed together in a path.
3373     """
3374     options, args = tools.OptionParserWithLogging.parse_args(
3375         self, *args, **kwargs)
3376     if not options.log:
3377       self.error('Must supply a log file with -l')
3378     options.log = os.path.abspath(options.log)
3379     return options, args
3380
3381
3382 def main(argv):
3383   dispatcher = subcommand.CommandDispatcher(__name__)
3384   try:
3385     return dispatcher.execute(
3386         OptionParserTraceInputs(version=__version__), argv)
3387   except TracingFailure, e:
3388     sys.stderr.write('\nError: ')
3389     sys.stderr.write(str(e))
3390     sys.stderr.write('\n')
3391     return 1
3392
3393
3394 if __name__ == '__main__':
3395   fix_encoding.fix_encoding()
3396   tools.disable_buffering()
3397   colorama.init()
3398   sys.exit(main(sys.argv[1:]))