3 # Copyright 2012 The Swarming Authors. All rights reserved.
4 # Use of this source code is governed under the Apache License, Version 2.0 that
5 # can be found in the LICENSE file.
"""Traces an executable and its child processes and extracts the files accessed
10 The implementation uses OS-specific API. The native Kernel logger and the ETL
11 interface is used on Windows. Dtrace is used on OSX. Strace is used otherwise.
12 The OS-specific implementation is hidden in an 'API' interface.
14 The results are embedded in a Results instance. The tracing is done in two
15 phases, the first is to do the actual trace and generate an
16 implementation-specific log file. Then the log file is parsed to extract the
information, including the individual child processes and the files accessed
from their point of view.
import logging
import os
import re
import sys
import tempfile
import threading
import weakref

from third_party import colorama
from third_party.depot_tools import fix_encoding
from third_party.depot_tools import subcommand

from utils import file_path
from utils import tools
44 ## OS-specific imports
46 if sys.platform == 'win32':
47 from ctypes.wintypes import byref, c_int, c_wchar_p
48 from ctypes.wintypes import windll # pylint: disable=E0611
54 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
55 ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
58 class TracingFailure(Exception):
59 """An exception occured during tracing."""
60 def __init__(self, description, pid, line_number, line, *args):
61 super(TracingFailure, self).__init__(
62 description, pid, line_number, line, *args)
63 self.description = description
65 self.line_number = line_number
70 out = self.description
72 out += '\npid: %d' % self.pid
74 out += '\nline: %d' % self.line_number
76 out += '\n%s' % self.line
78 out += '\n' + ', '.join(map(str, filter(None, self.extra)))
82 ## OS-specific functions
84 if sys.platform == 'win32':
85 def get_current_encoding():
86 """Returns the 'ANSI' code page associated to the process."""
87 return 'cp%d' % int(windll.kernel32.GetACP())
90 def CommandLineToArgvW(command_line):
91 """Splits a commandline into argv using CommandLineToArgvW()."""
92 # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx
94 assert isinstance(command_line, unicode)
95 ptr = windll.shell32.CommandLineToArgvW(command_line, byref(size))
97 return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]
99 windll.kernel32.LocalFree(ptr)
103 """Returns True if the user is an administrator on Windows.
105 It is required for tracing to work.
107 if sys.platform != 'win32':
109 return bool(windll.shell32.IsUserAnAdmin())
112 def create_subprocess_thunk():
113 """Creates a small temporary script to start the child process.
115 This thunk doesn't block, its unique name is used to identify it as the
118 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
123 'import subprocess, sys\n'
124 'sys.exit(subprocess.call(sys.argv[2:]))\n'
131 def create_exec_thunk():
132 """Creates a small temporary script to start the child executable.
134 Reads from the file handle provided as the fisrt argument to block, then
135 execv() the command to be traced.
137 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
143 'fd = int(sys.argv[1])\n'
144 # This will block until the controlling process writes a byte on the
145 # pipe. It will do so once the tracing tool, e.g. strace, is ready to
149 'os.execv(sys.argv[2], sys.argv[2:])\n'
156 def strace_process_quoted_arguments(text):
157 """Extracts quoted arguments on a string and return the arguments as a list.
159 Implemented as an automaton. Supports incomplete strings in the form
163 With text = '"foo", "bar"', the function will return ['foo', 'bar']
165 TODO(maruel): Implement escaping.
167 # All the possible states of the DFA.
168 ( NEED_QUOTE, # Begining of a new arguments.
169 INSIDE_STRING, # Inside an argument.
170 ESCAPED, # Found a '\' inside a quote. Treat the next char as-is.
171 NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be
172 # a serie of 3 dots or a comma.
173 NEED_SPACE, # Right after a comma
174 NEED_DOT_2, # Found a dot, need a second one.
175 NEED_DOT_3, # Found second dot, need a third one.
176 NEED_COMMA, # Found third dot, need a comma.
181 for index, char in enumerate(text):
183 if state == NEED_QUOTE:
184 state = INSIDE_STRING
185 # A new argument was found.
187 elif state == INSIDE_STRING:
188 # The argument is now closed.
189 state = NEED_COMMA_OR_DOT
190 elif state == ESCAPED:
192 state = INSIDE_STRING
195 'Can\'t process char \'%s\' at column %d for: %r' % (
200 if state in (NEED_COMMA_OR_DOT, NEED_COMMA):
202 elif state == INSIDE_STRING:
204 elif state == ESCAPED:
206 state = INSIDE_STRING
209 'Can\'t process char \'%s\' at column %d for: %r' % (
214 if state == NEED_SPACE:
216 elif state == INSIDE_STRING:
218 elif state == ESCAPED:
220 state = INSIDE_STRING
223 'Can\'t process char \'%s\' at column %d for: %r' % (
228 if state in (NEED_QUOTE, NEED_COMMA_OR_DOT):
229 # The string is incomplete, this mean the strace -s flag should be
231 # For NEED_QUOTE, the input string would look like '"foo", ...'.
232 # For NEED_COMMA_OR_DOT, the input string would look like '"foo"...'
234 elif state == NEED_DOT_2:
236 elif state == NEED_DOT_3:
238 elif state == INSIDE_STRING:
240 elif state == ESCAPED:
242 state = INSIDE_STRING
245 'Can\'t process char \'%s\' at column %d for: %r' % (
252 state = INSIDE_STRING
253 elif state == INSIDE_STRING:
257 'Can\'t process char \'%s\' at column %d for: %r' % (
262 if state == INSIDE_STRING:
266 'Can\'t process char \'%s\' at column %d for: %r' % (
270 if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):
272 'String is incorrectly terminated: %r' % text,
277 def assert_is_renderable(pseudo_string):
278 """Asserts the input is a valid object to be processed by render()."""
280 pseudo_string is None or
281 isinstance(pseudo_string, unicode) or
282 hasattr(pseudo_string, 'render')), repr(pseudo_string)
285 def render(pseudo_string):
286 """Converts the pseudo-string to an unicode string."""
287 if pseudo_string is None or isinstance(pseudo_string, unicode):
289 return pseudo_string.render()
292 class Results(object):
293 """Results of a trace session."""
295 class _TouchedObject(object):
296 """Something, a file or a directory, that was accessed."""
297 def __init__(self, root, path, tainted, size, nb_files):
299 '%s(%s, %s, %s, %s, %s)' %
300 (self.__class__.__name__, root, path, tainted, size, nb_files))
301 assert_is_renderable(root)
302 assert_is_renderable(path)
305 self.tainted = tainted
306 self.nb_files = nb_files
307 # Can be used as a cache or a default value, depending on context. In
308 # particular, once self.tainted is True, because the path was replaced
309 # with a variable, it is not possible to look up the file size.
311 # These are cache only.
312 self._real_path = None
314 # Check internal consistency.
316 assert tainted or bool(root) != bool(file_path.isabs(path)), (root, path)
318 not os.path.exists(self.full_path) or
319 (self.full_path == file_path.get_native_path_case(self.full_path))), (
322 file_path.get_native_path_case(self.full_path))
326 return self.size != -1
331 return os.path.join(self.root, self.path)
336 """Returns the path with symlinks resolved."""
337 if not self._real_path:
338 self._real_path = os.path.realpath(self.full_path)
339 return self._real_path
343 """File's size. -1 is not existent.
345 Once tainted, it is not possible the retrieve the file size anymore since
346 the path is composed of variables.
348 if self._size is None and not self.tainted:
350 self._size = os.stat(self.full_path).st_size
356 """Returns a dict representing this object.
358 A 'size' of 0 means the file was only touched and not read.
365 def replace_variables(self, variables):
366 """Replaces the root of this File with one of the variables if it matches.
368 If a variable replacement occurs, the cloned object becomes tainted.
370 for variable, root_path in variables.iteritems():
371 if self.path.startswith(root_path):
373 self.root, variable + self.path[len(root_path):], True)
374 # No need to clone, returns ourself.
377 def strip_root(self, root):
378 """Returns a clone of itself with 'root' stripped off.
380 Note that the file is kept if it is either accessible from a symlinked
381 path that was used to access the file or through the real path.
383 # Check internal consistency.
386 (file_path.isabs(root) and root.endswith(os.path.sep))), root
387 if not self.full_path.startswith(root):
388 # Now try to resolve the symlinks to see if it can be reached this way.
389 # Only try *after* trying without resolving symlink.
390 if not self.real_path.startswith(root):
392 path = self.real_path
394 path = self.full_path
395 return self._clone(root, path[len(root):], self.tainted)
397 def _clone(self, new_root, new_path, tainted):
398 raise NotImplementedError(self.__class__.__name__)
400 class File(_TouchedObject):
401 """A file that was accessed. May not be present anymore.
403 If tainted is true, it means it is not a real path anymore as a variable
406 |mode| can be one of None, TOUCHED, READ or WRITE.
408 # Was probed for existence, and it is existent, but was never _opened_.
410 # Opened for read only and guaranteed to not have been written to.
415 # They are listed in order of priority. E.g. if a file is traced as TOUCHED
416 # then as WRITE, only keep WRITE. None means no idea, which is a problem on
418 ACCEPTABLE_MODES = (None, TOUCHED, READ, WRITE)
420 def __init__(self, root, path, tainted, size, mode):
421 assert mode in self.ACCEPTABLE_MODES
422 super(Results.File, self).__init__(root, path, tainted, size, 1)
425 def _clone(self, new_root, new_path, tainted):
426 """Clones itself keeping meta-data."""
427 # Keep the self.size and self._real_path caches for performance reason. It
428 # is also important when the file becomes tainted (with a variable instead
429 # of the real path) since self.path is not an on-disk path anymore so
430 # out._size cannot be updated.
431 out = self.__class__(new_root, new_path, tainted, self.size, self.mode)
432 out._real_path = self._real_path
436 out = super(Results.File, self).flatten()
437 out['mode'] = self.mode
440 class Directory(_TouchedObject):
441 """A directory of files. Must exist.
443 For a Directory instance, self.size is not a cache, it's an actual value
444 that is never modified and represents the total size of the files contained
445 in this directory. It is possible that the directory is empty so that
446 size==0; this happens if there's only an invalid symlink in it.
448 def __init__(self, root, path, tainted, size, nb_files):
449 """path='.' is a valid value and must be handled appropriately."""
450 assert not path.endswith(os.path.sep), path
451 super(Results.Directory, self).__init__(
452 root, path + os.path.sep, tainted, size, nb_files)
455 out = super(Results.Directory, self).flatten()
456 out['nb_files'] = self.nb_files
459 def _clone(self, new_root, new_path, tainted):
460 """Clones itself keeping meta-data."""
461 out = self.__class__(
463 new_path.rstrip(os.path.sep),
467 out._real_path = self._real_path
470 class Process(object):
471 """A process that was traced.
473 Contains references to the files accessed by this process and its children.
475 def __init__(self, pid, files, executable, command, initial_cwd, children):
476 logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
478 self.files = sorted(files, key=lambda x: x.path)
479 self.children = children
480 self.executable = executable
481 self.command = command
482 self.initial_cwd = initial_cwd
484 # Check internal consistency.
485 assert len(set(f.path for f in self.files)) == len(self.files), sorted(
486 f.path for f in self.files)
487 assert isinstance(self.children, list)
488 assert isinstance(self.files, list)
492 for child in self.children:
499 'children': [c.flatten() for c in self.children],
500 'command': self.command,
501 'executable': self.executable,
502 'files': [f.flatten() for f in self.files],
503 'initial_cwd': self.initial_cwd,
507 def strip_root(self, root):
508 assert file_path.isabs(root) and root.endswith(os.path.sep), root
509 # Loads the files after since they are constructed as objects.
510 out = self.__class__(
512 filter(None, (f.strip_root(root) for f in self.files)),
516 [c.strip_root(root) for c in self.children])
518 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
521 def __init__(self, process):
522 self.process = process
528 'root': self.process.flatten(),
533 if self._files is None:
534 self._files = sorted(
535 sum((p.files for p in self.process.all), []),
536 key=lambda x: x.path)
541 return [f for f in self.files if f.existent]
544 def non_existent(self):
545 return [f for f in self.files if not f.existent]
547 def strip_root(self, root):
548 """Returns a clone with all the files outside the directory |root| removed
549 and converts all the path to be relative paths.
551 It keeps files accessible through the |root| directory or that have been
552 accessed through any symlink which points to the same directory.
554 # Resolve any symlink
555 root = os.path.realpath(root)
557 file_path.get_native_path_case(root).rstrip(os.path.sep) + os.path.sep)
558 logging.debug('strip_root(%s)' % root)
559 return Results(self.process.strip_root(root))
562 class ApiBase(object):
563 """OS-agnostic API to trace a process and its children."""
564 class Context(object):
565 """Processes one log line at a time and keeps the list of traced processes.
567 The parsing is complicated by the fact that logs are traced out of order for
568 strace but in-order for dtrace and logman. In addition, on Windows it is
569 very frequent that processids are reused so a flat list cannot be used. But
570 at the same time, it is impossible to faithfully construct a graph when the
571 logs are processed out of order. So both a tree and a flat mapping are used,
572 the tree is the real process tree, while the flat mapping stores the last
573 valid process for the corresponding processid. For the strace case, the
574 tree's head is guessed at the last moment.
576 class Process(object):
577 """Keeps context for one traced child process.
579 Logs all the files this process touched. Ignores directories.
581 def __init__(self, blacklist, pid, initial_cwd):
582 # Check internal consistency.
583 assert isinstance(pid, int), repr(pid)
584 assert_is_renderable(initial_cwd)
586 # children are Process instances.
588 self.initial_cwd = initial_cwd
591 self.executable = None
593 self._blacklist = blacklist
595 def to_results_process(self):
596 """Resolves file case sensitivity and or late-bound strings."""
597 # When resolving files, it's normal to get dupe because a file could be
598 # opened multiple times with different case. Resolve the deduplication
601 """Returns the native file path case.
603 Converts late-bound strings.
606 # Do not convert None instance to 'None'.
610 # If the path is not absolute, which tends to happen occasionally on
611 # Windows, it is not possible to get the native path case so ignore
612 # that trace. It mostly happens for 'executable' value.
613 x = file_path.get_native_path_case(x)
616 def fix_and_blacklist_path(x, m):
617 """Receives a tuple (filepath, mode) and processes filepath."""
621 # The blacklist needs to be reapplied, since path casing could
622 # influence blacklisting.
623 if self._blacklist(x):
625 # Filters out directories. Some may have passed through.
630 # Renders all the files as strings, as some could be RelativePath
631 # instances. It is important to do it first since there could still be
632 # multiple entries with the same path but different modes.
634 fix_and_blacklist_path(f, m) for f, m in self.files.iteritems())
636 (f for f in rendered if f),
637 key=lambda x: (x[0], Results.File.ACCEPTABLE_MODES.index(x[1])))
638 # Then converting into a dict will automatically clean up lesser
641 Results.File(None, f, False, None, m)
642 for f, m in dict(files).iteritems()
644 return Results.Process(
647 fix_path(self.executable),
649 fix_path(self.initial_cwd),
650 [c.to_results_process() for c in self.children])
652 def add_file(self, filepath, mode):
653 """Adds a file if it passes the blacklist."""
654 if self._blacklist(render(filepath)):
656 logging.debug('add_file(%d, %s, %s)', self.pid, filepath, mode)
657 # Note that filepath and not render(filepath) is added. It is because
658 # filepath could be something else than a string, like a RelativePath
659 # instance for dtrace logs.
660 modes = Results.File.ACCEPTABLE_MODES
661 old_mode = self.files.setdefault(filepath, mode)
662 if old_mode != mode and modes.index(old_mode) < modes.index(mode):
663 # Take the highest value.
664 self.files[filepath] = mode
666 def __init__(self, blacklist):
667 self.blacklist = blacklist
669 self.root_process = None
670 # dict to accelerate process lookup, to not have to lookup the whole graph
672 self._process_lookup = {}
674 class Tracer(object):
675 """During it's lifetime, the tracing subsystem is enabled."""
676 def __init__(self, logname):
677 self._logname = logname
678 self._lock = threading.RLock()
680 self._initialized = True
681 self._scripts_to_cleanup = []
683 def trace(self, cmd, cwd, tracename, output):
684 """Runs the OS-specific trace program on an executable.
687 - cmd: The command (a list) to run.
688 - cwd: Current directory to start the child process in.
689 - tracename: Name of the trace in the logname file.
690 - output: If False, redirects output to PIPEs.
692 Returns a tuple (resultcode, output) and updates the internal trace
695 # The implementation adds an item to self._traces.
696 raise NotImplementedError(self.__class__.__name__)
698 def close(self, _timeout=None):
699 """Saves the meta-data in the logname file.
701 For kernel-based tracing, stops the tracing subsystem.
703 Must not be used manually when using 'with' construct.
706 if not self._initialized:
707 raise TracingFailure(
708 'Called %s.close() on an unitialized object' %
709 self.__class__.__name__,
712 while self._scripts_to_cleanup:
714 os.remove(self._scripts_to_cleanup.pop())
716 logging.error('Failed to delete a temporary script: %s', e)
717 tools.write_json(self._logname, self._gen_logdata(), False)
719 self._initialized = False
721 def post_process_log(self):
722 """Post-processes the log so it becomes faster to load afterward.
724 Must not be used manually when using 'with' construct.
726 assert not self._initialized, 'Must stop tracing first.'
728 def _gen_logdata(self):
729 """Returns the data to be saved in the trace file."""
731 'traces': self._traces,
735 """Enables 'with' statement."""
738 def __exit__(self, exc_type, exc_value, traceback):
739 """Enables 'with' statement."""
741 # If an exception was thrown, do not process logs.
743 self.post_process_log()
745 def get_tracer(self, logname):
746 """Returns an ApiBase.Tracer instance.
748 Initializes the tracing subsystem, which is a requirement for kernel-based
749 tracers. Only one tracer instance should be live at a time!
751 logname is the filepath to the json file that will contain the meta-data
754 return self.Tracer(logname)
757 def clean_trace(logname):
758 """Deletes an old log."""
759 raise NotImplementedError()
762 def parse_log(cls, logname, blacklist, trace_name):
763 """Processes trace logs and returns the files opened and the files that do
766 It does not track directories.
769 - logname: must be an absolute path.
770 - blacklist: must be a lambda.
771 - trace_name: optional trace to read, defaults to reading all traces.
773 Most of the time, files that do not exist are temporary test files that
774 should be put in /tmp instead. See http://crbug.com/116251.
776 Returns a list of dict with keys:
777 - results: A Results instance.
778 - trace: The corresponding tracename parameter provided to
779 get_tracer().trace().
780 - output: Output gathered during execution, if get_tracer().trace(...,
781 output=False) was used.
783 raise NotImplementedError(cls.__class__.__name__)
786 class Strace(ApiBase):
787 """strace implies linux."""
789 def load_filename(filename):
790 """Parses a filename in a log."""
791 # TODO(maruel): Be compatible with strace -x.
792 assert isinstance(filename, str)
795 while i < len(filename):
798 out += chr(int(filename[i+1:i+4], 8))
803 # TODO(maruel): That's not necessarily true that the current code page is
805 return out.decode('utf-8')
807 class Context(ApiBase.Context):
808 """Processes a strace log line and keeps the list of existent and non
809 existent files accessed.
813 Uses late-binding to processes the cwd of each process. The problem is that
814 strace generates one log file per process it traced but doesn't give any
815 information about which process was started when and by who. So process the
816 logs out of order and use late binding with RelativePath to be able to
817 deduce the initial directory of each process once all the logs are parsed.
819 TODO(maruel): Use the script even in the non-sudo case, so log parsing can
820 be done in two phase: first find the root process, then process the child
821 processes in order. With that, it should be possible to not use RelativePath
822 anymore. This would significantly simplify the code!
824 class Process(ApiBase.Context.Process):
825 """Represents the state of a process.
827 Contains all the information retrieved from the pid-specific log.
829 # Function names are using ([a-z_0-9]+)
830 # This is the most common format. function(args) = result
831 RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.*?)\)\s+= (.+)$')
832 # An interrupted function call, only grab the minimal header.
833 RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')
834 # A resumed function call.
835 RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')
836 # A process received a signal.
837 RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')
838 # A process didn't handle a signal. Ignore any junk appearing before,
839 # because the process was forcibly killed so it won't open any new file.
840 RE_KILLED = re.compile(
841 r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$')
842 # The process has exited.
843 RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')
844 # A call was canceled. Ignore any prefix.
845 RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$')
846 # Happens when strace fails to even get the function name.
847 UNNAMED_FUNCTION = '????'
849 # Corner-case in python, a class member function decorator must not be
851 def parse_args(regexp, expect_zero): # pylint: disable=E0213
852 """Automatically convert the str 'args' into a list of processed
856 - regexp is used to parse args.
857 - expect_zero: one of True, False or None.
858 - True: will check for result.startswith('0') first and will ignore
859 the trace line completely otherwise. This is important because for
860 many functions, the regexp will not process if the call failed.
861 - False: will check for not result.startswith(('?', '-1')) for the
862 same reason than with True.
863 - None: ignore result.
865 def meta_hook(function):
866 assert function.__name__.startswith('handle_')
867 def hook(self, args, result):
868 if expect_zero is True and not result.startswith('0'):
870 if expect_zero is False and result.startswith(('?', '-1')):
872 match = re.match(regexp, args)
874 raise TracingFailure(
875 'Failed to parse %s(%s) = %s' %
876 (function.__name__[len('handle_'):], args, result),
878 return function(self, match.groups(), result)
882 class RelativePath(object):
883 """A late-bound relative path."""
884 def __init__(self, parent, value):
885 assert_is_renderable(parent)
889 (isinstance(value, unicode) and not os.path.isabs(value)))
892 # TODO(maruel): On POSIX, '\\' is a valid character so remove this
894 assert '\\' not in self.value, value
895 assert '\\' not in self.value, (repr(value), repr(self.value))
898 """Returns the current directory this instance is representing.
900 This function is used to return the late-bound value.
902 assert self.parent is not None
903 parent = render(self.parent)
905 return os.path.normpath(os.path.join(parent, self.value))
908 def __init__(self, root, pid):
909 """Keeps enough information to be able to guess the original process
912 strace doesn't store which process was the initial process. So more
913 information needs to be kept so the graph can be reconstructed from the
916 logging.info('%s(%d)' % (self.__class__.__name__, pid))
917 super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)
918 assert isinstance(root, ApiBase.Context)
919 self._root = weakref.ref(root)
920 # The dict key is the function name of the pending call, like 'open'
922 self._pending_calls = {}
923 self._line_number = 0
924 # Current directory when the process started.
925 if isinstance(self._root(), unicode):
926 self.initial_cwd = self._root()
928 self.initial_cwd = self.RelativePath(self._root(), None)
933 """Returns the best known value of cwd."""
934 return self.cwd or self.initial_cwd
937 """Returns the string value of the RelativePath() object.
939 Used by RelativePath. Returns the initial directory and not the
940 current one since the current directory 'cwd' validity is time-limited.
942 The validity is only guaranteed once all the logs are processed.
944 return self.initial_cwd.render()
946 def on_line(self, line):
947 assert isinstance(line, str)
948 self._line_number += 1
951 raise TracingFailure(
952 'Found a trace for a terminated process or corrupted log',
955 if self.RE_SIGNAL.match(line):
959 match = self.RE_KILLED.match(line)
961 # Converts a '+++ killed by Foo +++' trace into an exit_group().
962 self.handle_exit_group(match.group(1), None)
965 match = self.RE_PROCESS_EXITED.match(line)
967 # Converts a '+++ exited with 1 +++' trace into an exit_group()
968 self.handle_exit_group(match.group(1), None)
971 match = self.RE_UNFINISHED.match(line)
973 if match.group(1) in self._pending_calls:
974 raise TracingFailure(
975 'Found two unfinished calls for the same function',
978 self._pending_calls[match.group(1)] = (
979 match.group(1) + match.group(2))
982 match = self.RE_UNAVAILABLE.match(line)
984 # This usually means a process was killed and a pending call was
986 # TODO(maruel): Look up the last exit_group() trace just above and
987 # make sure any self._pending_calls[anything] is properly flushed.
990 match = self.RE_RESUMED.match(line)
992 if match.group(1) not in self._pending_calls:
993 raise TracingFailure(
994 'Found a resumed call that was not logged as unfinished',
997 pending = self._pending_calls.pop(match.group(1))
998 # Reconstruct the line.
999 line = pending + match.group(2)
1001 match = self.RE_HEADER.match(line)
1003 # The line is corrupted. It happens occasionally when a process is
1004 # killed forcibly with activity going on. Assume the process died.
1005 # No other line can be processed afterward.
1006 logging.debug('%d is done: %s', self.pid, line)
1010 if match.group(1) == self.UNNAMED_FUNCTION:
1013 # It's a valid line, handle it.
1014 handler = getattr(self, 'handle_%s' % match.group(1), None)
1016 self._handle_unknown(match.group(1), match.group(2), match.group(3))
1017 return handler(match.group(2), match.group(3))
1018 except TracingFailure, e:
1019 # Hack in the values since the handler could be a static function.
1022 e.line_number = self._line_number
1023 # Re-raise the modified exception.
1025 except (KeyError, NotImplementedError, ValueError), e:
1026 raise TracingFailure(
1027 'Trace generated a %s exception: %s' % (
1028 e.__class__.__name__, str(e)),
1034 @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True)
1035 def handle_access(self, args, _result):
1036 self._handle_file(args[0], Results.File.TOUCHED)
1038 @parse_args(r'^\"(.+?)\"$', True)
1039 def handle_chdir(self, args, _result):
1041 self.cwd = self._mangle(args[0])
1042 logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
1044 @parse_args(r'^\"(.+?)\", (\d+), (\d+)$', False)
1045 def handle_chown(self, args, _result):
1046 # TODO(maruel): Look at result?
1047 self._handle_file(args[0], Results.File.WRITE)
1049 def handle_clone(self, _args, result):
1050 self._handling_forking('clone', result)
1052 def handle_close(self, _args, _result):
1055 @parse_args(r'^\"(.+?)\", (\d+)$', False)
1056 def handle_chmod(self, args, _result):
1057 self._handle_file(args[0], Results.File.WRITE)
1059 @parse_args(r'^\"(.+?)\", (\d+)$', False)
1060 def handle_creat(self, args, _result):
1061 self._handle_file(args[0], Results.File.WRITE)
1063 @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True)
1064 def handle_execve(self, args, _result):
1065 # Even if in practice execve() doesn't returns when it succeeds, strace
1066 # still prints '0' as the result.
1068 self._handle_file(filepath, Results.File.READ)
1069 self.executable = self._mangle(filepath)
1071 self.command = strace_process_quoted_arguments(args[1])
1072 except ValueError as e:
1073 raise TracingFailure(
1074 'Failed to process command line argument:\n%s' % e.args[0],
1077 def handle_exit_group(self, _args, _result):
1081 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', True)
1082 def handle_faccessat(self, args, _results):
1083 if args[0] == 'AT_FDCWD':
1084 self._handle_file(args[1], Results.File.TOUCHED)
1086 raise Exception('Relative faccess not implemented.')
1088 def handle_fallocate(self, _args, result):
1091 def handle_fork(self, args, result):
1092 self._handle_unknown('fork', args, result)
1094 def handle_futex(self, _args, _result):
1097 @parse_args(r'^(\".+?\"|0x[a-f0-9]+), (\d+)$', False)
1098 def handle_getcwd(self, args, _result):
1099 # TODO(maruel): Resolve file handle.
1100 if not args[0].startswith('0x'):
1101 filepath = args[0][1:-1]
1102 if os.path.isabs(filepath):
1103 logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
1104 if not isinstance(self.cwd, unicode):
1105 # Take the occasion to reset the path.
1106 self.cwd = self._mangle(filepath)
1108 # It should always match.
1109 assert self.cwd == Strace.load_filename(filepath), (
1112 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1113 def handle_link(self, args, _result):
1114 self._handle_file(args[0], Results.File.READ)
1115 self._handle_file(args[1], Results.File.WRITE)
1117 @parse_args(r'^(\".+?\"|0x[a-f0-9]+), \{(?:.+?, )?\.\.\.\}$', True)
1118 def handle_lstat(self, args, _result):
1119 # TODO(maruel): Resolve file handle.
1120 if not args[0].startswith('0x'):
1121 self._handle_file(args[0][1:-1], Results.File.TOUCHED)
1123 def handle_mkdir(self, _args, _result):
1124 # We track content, not directories.
1127 @parse_args(r'^(\".*?\"|0x[a-f0-9]+), ([A-Z\_\|]+)(|, \d+)$', False)
1128 def handle_open(self, args, _result):
1129 if 'O_DIRECTORY' in args[1]:
1131 # TODO(maruel): Resolve file handle.
1132 if not args[0].startswith('0x'):
1133 t = Results.File.READ if 'O_RDONLY' in args[1] else Results.File.WRITE
1134 self._handle_file(args[0][1:-1], t)
1137 r'^(\d+|AT_FDCWD), (\".*?\"|0x[a-f0-9]+), ([A-Z\_\|]+)(|, \d+)$',
1139 def handle_openat(self, args, _result):
1140 if 'O_DIRECTORY' in args[2]:
1142 if args[0] == 'AT_FDCWD':
1143 # TODO(maruel): Resolve file handle.
1144 if not args[1].startswith('0x'):
1146 Results.File.READ if 'O_RDONLY' in args[2]
1147 else Results.File.WRITE)
1148 self._handle_file(args[1][1:-1], t)
1150 # TODO(maruel): Implement relative open if necessary instead of the
1151 # AT_FDCWD flag, let's hope not since this means tracking all active
1152 # directory handles.
1153 raise NotImplementedError('Relative open via openat not implemented.')
1156 r'^(\".+?\"|0x[a-f0-9]+), (?:\".+?\"(?:\.\.\.)?|0x[a-f0-9]+), '
1159 def handle_readlink(self, args, _result):
1160 # TODO(maruel): Resolve file handle.
1161 if not args[0].startswith('0x'):
1162 self._handle_file(args[0][1:-1], Results.File.READ)
1164 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1165 def handle_rename(self, args, _result):
1166 self._handle_file(args[0], Results.File.READ)
1167 self._handle_file(args[1], Results.File.WRITE)
      def handle_rmdir(self, _args, _result):
        """Ignored on purpose."""
        # We track content, not directories.
      def handle_setxattr(self, _args, _result):
        """Ignored for now; setting an extended attribute could arguably be
        considered a write access.
        """
        # TODO(maruel): self._handle_file(args[0], Results.File.WRITE)
1177 @parse_args(r'^(\".+?\"|0x[a-f0-9]+), \{.+?\}$', True)
1178 def handle_stat(self, args, _result):
1179 # TODO(maruel): Resolve file handle.
1180 if not args[0].startswith('0x'):
1181 self._handle_file(args[0][1:-1], Results.File.TOUCHED)
1183 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1184 def handle_symlink(self, args, _result):
1185 self._handle_file(args[0], Results.File.TOUCHED)
1186 self._handle_file(args[1], Results.File.WRITE)
1188 @parse_args(r'^\"(.+?)\", \d+', True)
1189 def handle_truncate(self, args, _result):
1190 self._handle_file(args[0], Results.File.WRITE)
      def handle_unlink(self, _args, _result):
        """Ignored on purpose."""
        # In theory, the file had to be created anyway.
      def handle_unlinkat(self, _args, _result):
        """Ignored on purpose."""
        # In theory, the file had to be created anyway.
      def handle_statfs(self, _args, _result):
        """Ignored on purpose: queries filesystem metadata, not file content."""
      def handle_utimensat(self, _args, _result):
        """Ignored on purpose: only updates file timestamps."""
      def handle_vfork(self, _args, result):
        """vfork() is handled like a fork: transfers cwd to the child."""
        self._handling_forking('vfork', result)
      # NOTE(review): takes no self; presumably decorated @staticmethod — the
      # decorator line is missing from this excerpt.
      def _handle_unknown(function, args, result):
        """Fails loudly on any traced syscall that has no dedicated handler."""
        raise TracingFailure(
            'Unexpected/unimplemented trace %s(%s)= %s' %
                (function, args, result),
      def _handling_forking(self, name, result):
        """Transfers cwd from this process to the newly forked child.

        |name| is the syscall name (e.g. 'fork', 'vfork'), |result| its logged
        return value, which is the child's pid on success.
        """
        # A result starting with '?' or '-1' means the call failed; the branch
        # body (likely a `return`) is missing from this excerpt.
        if result.startswith(('?', '-1')):
        # Update the other process right away.
        childpid = int(result)
        child = self._root().get_or_set_proc(childpid)
        if child.parentid is not None or childpid in self.children:
          raise TracingFailure(
              'Found internal inconsitency in process lifetime detection '
              'during a %s() call' % name,
        # Copy the cwd object.
        child.initial_cwd = self.get_cwd()
        child.parentid = self.pid
        # It is necessary because the logs are processed out of order.
        self.children.append(child)
1236 def _handle_file(self, filepath, mode):
1237 filepath = self._mangle(filepath)
1238 self.add_file(filepath, mode)
      def _mangle(self, filepath):
        """Decodes a filepath found in the log and convert it to a late-bound
        path if necessary.

        |filepath| is an strace 'encoded' string and the returned value is
        either an unicode string if the path was absolute or a late bound path
        otherwise.
        """
        filepath = Strace.load_filename(filepath)
        if os.path.isabs(filepath):
        # NOTE(review): the body of the branch above (likely a direct return of
        # the absolute path) is missing from this excerpt.
        if isinstance(self.get_cwd(), unicode):
          return os.path.normpath(os.path.join(self.get_cwd(), filepath))
        # cwd itself is late-bound, so the joined path must be too.
        return self.RelativePath(self.get_cwd(), filepath)
    def __init__(self, blacklist, root_pid, initial_cwd):
      """|root_pid| may be None when the root process is not known.

      In that case, a search is done after reading all the logs to figure out
      the root process.
      """
      super(Strace.Context, self).__init__(blacklist)
      assert_is_renderable(initial_cwd)
      # Pid of the process started by strace; None in the sudo case.
      self.root_pid = root_pid
      # cwd of the root process at trace start time.
      self.initial_cwd = initial_cwd
      # NOTE(review): the `def` line of this accessor is missing from this
      # excerpt.
      """Returns the string value of the initial cwd of the root process.

      Used by RelativePath.
      """
      return self.initial_cwd
1274 def on_line(self, pid, line):
1275 """Transfers control into the Process.on_line() function."""
1276 self.get_or_set_proc(pid).on_line(line.strip())
    def to_results(self):
      """If necessary, finds back the root process and verify consistency."""
      # NOTE(review): several lines of this method (e.g. the `else:` of the
      # sudo case) are missing from this excerpt; visible code kept as-is.
      if not self.root_pid:
        # The non-sudo case. The traced process was started by strace itself,
        # so the pid of the traced process is not known.
        root = [p for p in self._process_lookup.itervalues() if not p.parentid]
        self.root_process = root[0]
        # Save it for later.
        self.root_pid = self.root_process.pid
        # The sudo case. The traced process was started manually so its pid is
        self.root_process = self._process_lookup.get(self.root_pid)
        if not self.root_process:
          raise TracingFailure(
              'Found internal inconsitency in process lifetime detection '
              'while finding the root process',
              sorted(self._process_lookup))
      process = self.root_process.to_results_process()
      # Every pid seen in the logs must appear exactly once in the tree.
      if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
        raise TracingFailure(
            'Found internal inconsitency in process lifetime detection '
            'while looking for len(tree) == len(list)',
            sorted(self._process_lookup),
            sorted(p.pid for p in process.all))
      return Results(process)
    def get_or_set_proc(self, pid):
      """Returns the Context.Process instance for this pid or creates a new one.
      """
      if not pid or not isinstance(pid, int):
        raise TracingFailure(
            'Unpexpected value for pid: %r' % pid,
      # NOTE(review): the trailing arguments of the raise above are missing
      # from this excerpt.
      if pid not in self._process_lookup:
        self._process_lookup[pid] = self.Process(self, pid)
      return self._process_lookup[pid]
      # NOTE(review): the `def` line and the `prefix` assignment (presumably
      # 'handle_') are missing from this excerpt.
      """Returns the list of all handled traces to pass this as an argument to
      strace.
      """
      return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)]
  class Tracer(ApiBase.Tracer):
    def __init__(self, logname, use_sudo):
      """Sets up strace tracing into |logname|, optionally via sudo."""
      super(Strace.Tracer, self).__init__(logname)
      # Whether strace should be run under sudo.
      self.use_sudo = use_sudo
      # TODO(maruel): Use the jump script systematically to make it easy to
      # figure out the root process, so RelativePath is not necessary anymore.
      self._child_script = create_exec_thunk()
      self._scripts_to_cleanup.append(self._child_script)
    def trace(self, cmd, cwd, tracename, output):
      """Runs strace on an executable.

      When use_sudo=True, it is a 3-phases process: start the thunk, start
      sudo strace with the pid of the thunk and then have the thunk os.execve()
      the process to trace.
      """
      # NOTE(review): a significant number of lines of this method (command
      # list construction, the sudo/non-sudo if/else skeleton, Popen argument
      # lists) are missing from this excerpt; the visible code is kept as-is.
      logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
      assert os.path.isabs(cmd[0]), cmd[0]
      assert os.path.isabs(cwd), cwd
      assert os.path.normpath(cwd) == cwd, cwd
      if not self._initialized:
        raise TracingFailure(
            'Called Tracer.trace() on an unitialized object',
            None, None, None, tracename)
      assert tracename not in (i['trace'] for i in self._traces)
      stdout = stderr = None
        stdout = subprocess.PIPE
        stderr = subprocess.STDOUT
      # Ensure all file related APIs are hooked.
      traces = ','.join(Strace.Context.traces() + ['file'])
        # Each child process has its own trace file. It is necessary because
        # strace may generate corrupted log file if multiple processes are
        # heavily doing syscalls simultaneously.
        # Reduce whitespace usage.
        # hex encode non-ascii strings.
        # TODO(maruel): '-x',
        # TODO(maruel): '-ttt',
        # Signals are unnecessary noise here. Note the parser can cope with them
        # but reduce the unnecessary output.
        # Print as much data as wanted.
        '-s', '%d' % self.MAX_LEN,
        '-e', 'trace=%s' % traces,
        '-o', self._logname + '.' + tracename,
        pipe_r, pipe_w = os.pipe()
        # Start the child process paused.
        target_cmd = [sys.executable, self._child_script, str(pipe_r)] + cmd
        logging.debug(' '.join(target_cmd))
        child_proc = subprocess.Popen(
            stdin=subprocess.PIPE,
        # TODO(maruel): both processes must use the same UID for it to work
        # without sudo. Look why -p is failing at the moment without sudo.
            '-p', str(child_proc.pid),
        logging.debug(' '.join(trace_cmd))
        strace_proc = subprocess.Popen(
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        line = strace_proc.stderr.readline()
        if not re.match(r'^Process \d+ attached \- interrupt to quit$', line):
          # TODO(maruel): Raise an exception.
        # Now fire the child process.
        os.write(pipe_w, 'x')
        out = child_proc.communicate()[0]
        strace_out = strace_proc.communicate()[0]
        # TODO(maruel): if strace_proc.returncode: Add an exception.
        saved_out = strace_out if strace_proc.returncode else out
        root_pid = child_proc.pid
        logging.debug(' '.join(trace_cmd))
        child_proc = subprocess.Popen(
            stdin=subprocess.PIPE,
        out = child_proc.communicate()[0]
        # TODO(maruel): Walk the logs and figure out the root process would
        # simplify parsing the logs a *lot*.
        # The trace reader will have to figure out.
      assert tracename not in (i['trace'] for i in self._traces)
      self._traces.append(
            'output': saved_out,
      return child_proc.returncode, out
  def __init__(self, use_sudo=None):
    """|use_sudo|: None autodetects, True/False forces sudo usage for strace."""
    super(Strace, self).__init__()
    self.use_sudo = use_sudo
  def get_tracer(self, logname):
    """Returns a Tracer writing to |logname| with this API's sudo setting."""
    return self.Tracer(logname, self.use_sudo)
  def clean_trace(logname):
    """Deletes the trace log and any per-pid companion files.

    Per-pid files are named <logname>.<pid>.
    """
    # NOTE(review): the os.remove() calls for both branches are missing from
    # this excerpt.
    if os.path.isfile(logname):
    # Also delete any pid specific file from previous traces.
    for i in glob.iglob(logname + '.*'):
      if i.rsplit('.', 1)[1].isdigit():
  def parse_log(cls, logname, blacklist, trace_name):
    """Parses a strace log previously written by Tracer.trace().

    Returns one result dict per recorded trace; |trace_name|, when set,
    restricts parsing to that single trace.
    """
    # NOTE(review): several lines of this method (result dict construction,
    # the try/else skeleton) are missing from this excerpt.
    logging.info('parse_log(%s, ..., %s)', logname, trace_name)
    assert os.path.isabs(logname)
    data = tools.read_json(logname)
    for item in data['traces']:
      if trace_name and item['trace'] != trace_name:
          'output': item['output'],
          'trace': item['trace'],
        context = cls.Context(blacklist, item['pid'], item['cwd'])
        # One log file per child process, named <logname>.<trace>.<pid>.
        for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])):
          logging.debug('Reading %s', pidfile)
          pid = pidfile.rsplit('.', 1)[1]
            for line in open(pidfile, 'rb'):
              context.on_line(pid, line)
              # Ensures that a completely empty trace still creates the
              # corresponding Process instance by logging a dummy line.
              context.on_line(pid, '')
            logging.warning('Found unexpected file %s', pidfile)
        result['results'] = context.to_results()
      except TracingFailure:
        result['exception'] = sys.exc_info()
class Dtrace(ApiBase):
  """Uses DTrace framework through dtrace. Requires root access.

  dtruss can't be used because it has compatibility issues with python.

  Also, the pid->cwd handling needs to be done manually since OSX has no way to
  get the absolute path of the 'cwd' dtrace variable from the probe.

  Also, OSX doesn't populate curpsinfo->pr_psargs properly, see
  https://discussions.apple.com/thread/1980539. So resort to handling execve()

  errno is not printed in the log since this implementation currently only cares
  about files that were successfully opened.
  """
  class Context(ApiBase.Context):
    # Format: index pid function(args)
    RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$')

    # Arguments parsing.
    RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$')
    RE_CHDIR = re.compile(r'^\"(.+?)\"$')
    RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$')
    RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$')
    RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$')
    RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')

    # O_DIRECTORY is not defined on Windows and dtrace doesn't exist on Windows.
    O_DIRECTORY = os.O_DIRECTORY if hasattr(os, 'O_DIRECTORY') else None
    # NOTE(review): an O_RDWR alias (likely `O_RDWR = os.O_RDWR`) appears to be
    # missing from this excerpt; handle_open() below references self.O_RDWR.
    O_WRONLY = os.O_WRONLY
    class Process(ApiBase.Context.Process):
      def __init__(self, *args):
        super(Dtrace.Context.Process, self).__init__(*args)
        # Tracked per-process since chdir() events update it incrementally.
        self.cwd = self.initial_cwd
    def __init__(self, blacklist, thunk_pid, initial_cwd):
      # NOTE(review): the opener of the logging call for the line below is
      # missing from this excerpt.
          '%s(%d, %s)' % (self.__class__.__name__, thunk_pid, initial_cwd))
      super(Dtrace.Context, self).__init__(blacklist)
      assert isinstance(initial_cwd, unicode), initial_cwd
      # Process ID of the temporary script created by create_subprocess_thunk().
      self._thunk_pid = thunk_pid
      self._initial_cwd = initial_cwd
      # 1-based counter of log lines seen, for error reporting.
      self._line_number = 0
    def on_line(self, line):
      """Parses one dtrace log line and dispatches it to a handle_* method."""
      # NOTE(review): several lines of this method (the `if not match:` guard,
      # the getattr() dispatch opener, the try: and re-raise lines) are missing
      # from this excerpt.
      assert isinstance(line, unicode), line
      self._line_number += 1
      match = self.RE_HEADER.match(line)
        raise TracingFailure(
            'Found malformed line: %s' % line,
          'handle_%s' % match.group(2).replace('-', '_'),
          self._handle_ignored)
      # It is guaranteed to succeed because of the regexp. Or at least I thought
      pid = int(match.group(1))
        return fn(pid, match.group(3))
      except TracingFailure, e:
        # Hack in the values since the handler could be a static function.
        e.line_number = self._line_number
        # Re-raise the modified exception.
      except (KeyError, NotImplementedError, ValueError), e:
        raise TracingFailure(
            'Trace generated a %s exception: %s' % (
                e.__class__.__name__, str(e)),
    def to_results(self):
      """Converts the root process tree into a Results instance."""
      process = self.root_process.to_results_process()
      # Internal consistency check: every pid seen in the logs must appear
      # exactly once in the process tree.
      if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
        raise TracingFailure(
            'Found internal inconsitency in process lifetime detection '
            'while looking for len(tree) == len(list)',
            sorted(self._process_lookup),
            sorted(p.pid for p in process.all))
      return Results(process)
    def handle_dtrace_BEGIN(self, _pid, args):
      """Sanity-checks the very first line emitted by the dtrace script."""
      if not self.RE_DTRACE_BEGIN.match(args):
        raise TracingFailure(
            'Found internal inconsitency in dtrace_BEGIN log line',
    def handle_proc_start(self, pid, args):
      """Handles a new process being started.

      The dtrace script already takes care of only tracing the processes that
      are child of the traced processes so there is no need to verify the
      process hierarchy.
      """
      # NOTE(review): a few lines (the `if not match:` guard, the else/return
      # skeleton, the logging.debug opener) are missing from this excerpt.
      if pid in self._process_lookup:
        raise TracingFailure(
            'Found internal inconsitency in proc_start: %d started two times' %
      match = self.RE_PROC_START.match(args)
        raise TracingFailure(
            'Failed to parse arguments: %s' % args,
      ppid = int(match.group(1))
      # The first child of the thunk script is the root traced process.
      if ppid == self._thunk_pid and not self.root_process:
        proc = self.root_process = self.Process(
            self.blacklist, pid, self._initial_cwd)
      elif ppid in self._process_lookup:
        proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd)
        self._process_lookup[ppid].children.append(proc)
        # Another process tree, ignore.
      self._process_lookup[pid] = proc
          'New child: %s -> %d cwd:%s' %
          (ppid, pid, render(proc.initial_cwd)))
    def handle_proc_exit(self, pid, _args):
      """Marks a tracked process as exited by dropping its cwd."""
      if pid in self._process_lookup:
        # self._thunk_pid is not traced itself and other traces run neither.
        self._process_lookup[pid].cwd = None
    def handle_execve(self, pid, args):
      """Sets the process' executable.

      TODO(maruel): Read command line arguments. See
      https://discussions.apple.com/thread/1980539 for an example.
      https://gist.github.com/1242279

      Will have to put the answer at http://stackoverflow.com/questions/7556249.
      """
      # NOTE(review): the `return` of the ignore branch and the `if not match:`
      # guard are missing from this excerpt.
      if not pid in self._process_lookup:
        # Another process tree, ignore.
      match = self.RE_EXECVE.match(args)
        raise TracingFailure(
            'Failed to parse arguments: %r' % args,
      proc = self._process_lookup[pid]
      proc.executable = match.group(1)
      self._handle_file(pid, proc.executable, Results.File.READ)
      proc.command = self.process_escaped_arguments(match.group(3))
      # group(2) is argc as printed by the D script; verify the parse.
      if int(match.group(2)) != len(proc.command):
        raise TracingFailure(
            'Failed to parse execve() arguments: %s' % args,
    def handle_chdir(self, pid, args):
      """Updates the tracked cwd of a process on chdir()."""
      # NOTE(review): the ignore-branch `return` and the `else:` of the
      # absolute-path case are missing from this excerpt.
      if pid not in self._process_lookup:
        # Another process tree, ignore.
      cwd = self.RE_CHDIR.match(args).group(1)
      if not cwd.startswith('/'):
        # dtrace only logs the relative path; resolve against the current cwd.
        cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd)
        logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))
        logging.debug('handle_chdir(%d, %s)' % (pid, cwd))
      self._process_lookup[pid].cwd = cwd2
    def handle_open_nocancel(self, pid, args):
      """Redirects to handle_open(); both calls are equivalent for tracing."""
      return self.handle_open(pid, args)
    def handle_open(self, pid, args):
      """Maps an open() call to a READ or WRITE access on the path."""
      # NOTE(review): the ignore-branch `return`, the `if not match:` guard and
      # the _handle_file() call opener are missing from this excerpt.
      if pid not in self._process_lookup:
        # Another process tree, ignore.
      match = self.RE_OPEN.match(args)
        raise TracingFailure(
            'Failed to parse arguments: %s' % args,
      # group(2) is the open() flags as a hex string.
      flag = int(match.group(2), 16)
      if self.O_DIRECTORY & flag == self.O_DIRECTORY:
        # Ignore directories.
          Results.File.READ if not ((self.O_RDWR | self.O_WRONLY) & flag)
          else Results.File.WRITE)
    def handle_rename(self, pid, args):
      """rename() reads the old path and writes the new path."""
      # NOTE(review): the ignore-branch `return` and the `if not match:` guard
      # are missing from this excerpt.
      if pid not in self._process_lookup:
        # Another process tree, ignore.
      match = self.RE_RENAME.match(args)
        raise TracingFailure(
            'Failed to parse arguments: %s' % args,
      self._handle_file(pid, match.group(1), Results.File.READ)
      self._handle_file(pid, match.group(2), Results.File.WRITE)
    def _handle_file(self, pid, filepath, mode):
      """Registers a file access, resolving relative paths against the
      process' tracked cwd.
      """
      if not filepath.startswith('/'):
        filepath = os.path.join(self._process_lookup[pid].cwd, filepath)
      # We can get '..' in the path.
      filepath = os.path.normpath(filepath)
      # Sadly, still need to filter out directories here;
      # saw open_nocancel(".", 0, 0) = 0 lines.
      if os.path.isdir(filepath):
      # NOTE(review): the body of the branch above (likely a `return`) is
      # missing from this excerpt.
      self._process_lookup[pid].add_file(filepath, mode)
    def handle_ftruncate(self, pid, args):
      """Just used as a signal to kill dtrace, ignoring."""
    # NOTE(review): takes no self; presumably decorated @staticmethod — the
    # decorator line is missing from this excerpt.
    def _handle_ignored(pid, args):
      """Is called for all the event traces that are not handled."""
      raise NotImplementedError('Please implement me')
    def process_escaped_arguments(text):
      """Extracts escaped arguments on a string and return the arguments as a
      list.

      Implemented as an automaton.

      Example:
        With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the
        function will return ['python2.7', '-c', 'print("hi")]
      """
      # NOTE(review): several lines (the terminator strip, the nested unescape()
      # definition and most of its body) are missing from this excerpt.
      if not text.endswith('\\0'):
        raise ValueError('String is not null terminated: %r' % text, text)
        """Replaces '\\' with '\' and '\?' (where ? is anything) with ?."""
          if i == '\\' and not escaped:
      return [unescape(i) for i in text.split('\\001')]
1782 class Tracer(ApiBase.Tracer):
1783 # pylint: disable=C0301
1785 # To understand the following code, you'll want to take a look at:
1786 # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
1787 # https://wikis.oracle.com/display/DTrace/Variables
1788 # http://docs.oracle.com/cd/E19205-01/820-4221/
1790 # 0. Dump all the valid probes into a text file. It is important, you
1791 # want to redirect into a file and you don't want to constantly 'sudo'.
1792 # $ sudo dtrace -l > probes.txt
1794 # 1. Count the number of probes:
1795 # $ wc -l probes.txt
1796 # 81823 # On OSX 10.7, including 1 header line.
1798 # 2. List providers, intentionally skipping all the 'syspolicy10925' and the
1799 # likes and skipping the header with NR>1:
1800 # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]'
1814 # 3. List of valid probes:
1815 # $ grep syscall probes.txt | less
1816 # or use dtrace directly:
1817 # $ sudo dtrace -l -P syscall | less
1819 # trackedpid is an associative array where its value can be 0, 1 or 2.
1820 # 0 is for untracked processes and is the default value for items not
1821 # in the associative array.
1822 # 1 is for tracked processes.
1823 # 2 is for the script created by create_subprocess_thunk() only. It is not
  # tracked itself but all its descendants are.
1826 # The script will kill itself only once waiting_to_die == 1 and
1827 # current_processes == 0, so that both getlogin() was called and that
1828 # all traced processes exited.
1830 # TODO(maruel): Use cacheable predicates. See
1831 # https://wikis.oracle.com/display/DTrace/Performance+Considerations
1835 current_processes = 0;
1837 printf("%d %d %s_%s(\\"%s\\")\\n",
1838 logindex, PID, probeprov, probename, SCRIPT);
1842 proc:::start /trackedpid[ppid]/ {
1843 trackedpid[pid] = 1;
1844 current_processes += 1;
1845 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1846 logindex, pid, probeprov, probename,
1852 /* Should use SCRIPT but there is no access to this variable at that
1854 proc:::start /ppid == PID && execname == "Python"/ {
1855 trackedpid[pid] = 2;
1856 current_processes += 1;
1857 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1858 logindex, pid, probeprov, probename,
1864 proc:::exit /trackedpid[pid] &&
1865 current_processes == 1 &&
1866 waiting_to_die == 1/ {
1867 trackedpid[pid] = 0;
1868 current_processes -= 1;
1869 printf("%d %d %s_%s(%d)\\n",
1870 logindex, pid, probeprov, probename,
1875 proc:::exit /trackedpid[pid]/ {
1876 trackedpid[pid] = 0;
1877 current_processes -= 1;
1878 printf("%d %d %s_%s(%d)\\n",
1879 logindex, pid, probeprov, probename,
1884 /* Use an arcane function to detect when we need to die */
1885 syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ {
1887 printf("%d %d %s()\\n", logindex, pid, probefunc);
1890 syscall::ftruncate:entry /
1891 pid == PID && arg0 == FILE_ID && current_processes == 0/ {
1895 syscall::open*:entry /trackedpid[pid] == 1/ {
1896 self->open_arg0 = arg0;
1897 self->open_arg1 = arg1;
1898 self->open_arg2 = arg2;
1900 syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ {
1901 this->open_arg0 = copyinstr(self->open_arg0);
1902 printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n",
1903 logindex, pid, probefunc,
1908 this->open_arg0 = 0;
1910 syscall::open*:return /trackedpid[pid] == 1/ {
1911 self->open_arg0 = 0;
1912 self->open_arg1 = 0;
1913 self->open_arg2 = 0;
1916 syscall::rename:entry /trackedpid[pid] == 1/ {
1917 self->rename_arg0 = arg0;
1918 self->rename_arg1 = arg1;
1920 syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ {
1921 this->rename_arg0 = copyinstr(self->rename_arg0);
1922 this->rename_arg1 = copyinstr(self->rename_arg1);
1923 printf("%d %d %s(\\"%s\\", \\"%s\\")\\n",
1924 logindex, pid, probefunc,
1928 this->rename_arg0 = 0;
1929 this->rename_arg1 = 0;
1931 syscall::rename:return /trackedpid[pid] == 1/ {
1932 self->rename_arg0 = 0;
1933 self->rename_arg1 = 0;
1936 /* Track chdir, it's painful because it is only receiving relative path.
1938 syscall::chdir:entry /trackedpid[pid] == 1/ {
1939 self->chdir_arg0 = arg0;
1941 syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ {
1942 this->chdir_arg0 = copyinstr(self->chdir_arg0);
1943 printf("%d %d %s(\\"%s\\")\\n",
1944 logindex, pid, probefunc,
1947 this->chdir_arg0 = 0;
1949 syscall::chdir:return /trackedpid[pid] == 1/ {
1950 self->chdir_arg0 = 0;
1954 # execve-specific code, tends to throw a lot of exceptions.
1956 /* Finally what we care about! */
1957 syscall::exec*:entry /trackedpid[pid]/ {
1958 self->exec_arg0 = copyinstr(arg0);
1959 /* Incrementally probe for a NULL in the argv parameter of execve() to
1960 * figure out argc. */
1961 /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was
1963 self->exec_argc = 0;
1964 /* Probe for argc==1 */
1965 this->exec_argv = (user_addr_t*)copyin(
1966 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1967 self->exec_argc = this->exec_argv[self->exec_argc] ?
1968 (self->exec_argc + 1) : self->exec_argc;
1970 /* Probe for argc==2 */
1971 this->exec_argv = (user_addr_t*)copyin(
1972 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1973 self->exec_argc = this->exec_argv[self->exec_argc] ?
1974 (self->exec_argc + 1) : self->exec_argc;
1976 /* Probe for argc==3 */
1977 this->exec_argv = (user_addr_t*)copyin(
1978 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1979 self->exec_argc = this->exec_argv[self->exec_argc] ?
1980 (self->exec_argc + 1) : self->exec_argc;
1982 /* Probe for argc==4 */
1983 this->exec_argv = (user_addr_t*)copyin(
1984 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1985 self->exec_argc = this->exec_argv[self->exec_argc] ?
1986 (self->exec_argc + 1) : self->exec_argc;
1988 /* Copy the inputs strings since there is no guarantee they'll be
1989 * present after the call completed. */
1990 self->exec_argv0 = (self->exec_argc > 0) ?
1991 copyinstr(this->exec_argv[0]) : "";
1992 self->exec_argv1 = (self->exec_argc > 1) ?
1993 copyinstr(this->exec_argv[1]) : "";
1994 self->exec_argv2 = (self->exec_argc > 2) ?
1995 copyinstr(this->exec_argv[2]) : "";
1996 self->exec_argv3 = (self->exec_argc > 3) ?
1997 copyinstr(this->exec_argv[3]) : "";
1998 this->exec_argv = 0;
2000 syscall::exec*:return /trackedpid[pid] && errno == 0/ {
2001 /* We need to join strings here, as using multiple printf() would
2002 * cause tearing when multiple threads/processes are traced.
2003 * Since it is impossible to escape a string and join it to another one,
2004 * like sprintf("%s%S", previous, more), use hackery.
2005 * Each of the elements are split with a \\1. \\0 cannot be used because
2006 * it is simply ignored. This will conflict with any program putting a
2007 * \\1 in their execve() string but this should be "rare enough" */
2009 /* Process exec_argv[0] */
2010 this->args = strjoin(
2011 this->args, (self->exec_argc > 0) ? self->exec_argv0 : "");
2013 /* Process exec_argv[1] */
2014 this->args = strjoin(
2015 this->args, (self->exec_argc > 1) ? "\\1" : "");
2016 this->args = strjoin(
2017 this->args, (self->exec_argc > 1) ? self->exec_argv1 : "");
2019 /* Process exec_argv[2] */
2020 this->args = strjoin(
2021 this->args, (self->exec_argc > 2) ? "\\1" : "");
2022 this->args = strjoin(
2023 this->args, (self->exec_argc > 2) ? self->exec_argv2 : "");
2025 /* Process exec_argv[3] */
2026 this->args = strjoin(
2027 this->args, (self->exec_argc > 3) ? "\\1" : "");
2028 this->args = strjoin(
2029 this->args, (self->exec_argc > 3) ? self->exec_argv3 : "");
2031 /* Prints self->exec_argc to permits verifying the internal
2032 * consistency since this code is quite fishy. */
2033 printf("%d %d %s(\\"%s\\", [%d, %S])\\n",
2034 logindex, pid, probefunc,
2041 syscall::exec*:return /trackedpid[pid]/ {
2042 self->exec_arg0 = 0;
2043 self->exec_argc = 0;
2044 self->exec_argv0 = 0;
2045 self->exec_argv1 = 0;
2046 self->exec_argv2 = 0;
2047 self->exec_argv3 = 0;
2051 # Code currently not used.
2053 /* This is a good learning experience, since it traces a lot of things
2054 * related to the process and child processes.
2055 * Warning: it generates a gigantic log. For example, tracing
2056 * "data/trace_inputs/child1.py --child" generates a 2mb log and takes
2057 * several minutes to execute.
2060 mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2061 printf("%d %d %s_%s() = %d\\n",
2062 logindex, pid, probeprov, probefunc, errno);
2065 proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2066 printf("%d %d %s_%s() = %d\\n",
2067 logindex, pid, probeprov, probefunc, errno);
2070 sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2071 printf("%d %d %s_%s() = %d\\n",
2072 logindex, pid, probeprov, probefunc, errno);
2075 syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2076 printf("%d %d %s_%s() = %d\\n",
2077 logindex, pid, probeprov, probefunc, errno);
2080 vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2081 printf("%d %d %s_%s() = %d\\n",
2082 logindex, pid, probeprov, probefunc, errno);
2086 /* TODO(maruel): *stat* functions and friends
2087 syscall::access:return,
2088 syscall::chdir:return,
2089 syscall::chflags:return,
2090 syscall::chown:return,
2091 syscall::chroot:return,
2092 syscall::getattrlist:return,
2093 syscall::getxattr:return,
2094 syscall::lchown:return,
2095 syscall::lstat64:return,
2096 syscall::lstat:return,
2097 syscall::mkdir:return,
2098 syscall::pathconf:return,
2099 syscall::readlink:return,
2100 syscall::removexattr:return,
2101 syscall::setxattr:return,
2102 syscall::stat64:return,
2103 syscall::stat:return,
2104 syscall::truncate:return,
2105 syscall::unlink:return,
2106 syscall::utimes:return,
    def __init__(self, logname, use_sudo):
      """Starts the log collection with dtrace.

      Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so
      this needs to wait for dtrace to be "warmed up".
      """
      # NOTE(review): several lines of this method (the trace_cmd list opener,
      # the `use_sudo = False` assignment, a few flags, the `break` of the busy
      # wait) are missing from this excerpt; visible code kept as-is.
      super(Dtrace.Tracer, self).__init__(logname)
      # This script is used as a signal to figure out the root process.
      self._signal_script = create_subprocess_thunk()
      self._scripts_to_cleanup.append(self._signal_script)
      # This unique dummy temp file is used to signal the dtrace script that it
      # should stop as soon as all the child processes are done. A bit hackish
      # but works fine enough.
      self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp(
          prefix='trace_signal_file')

      dtrace_path = '/usr/sbin/dtrace'
      if not os.path.isfile(dtrace_path):
        dtrace_path = 'dtrace'
      elif use_sudo is None and (os.stat(dtrace_path).st_mode & stat.S_ISUID):
        # No need to sudo. For those following at home, don't do that.

      # Note: do not use the -p flag. It's useless if the initial process quits
      # too fast, resulting in missing traces from the grand-children. The D
      # code manages the dtrace lifetime itself.
        # Use a larger buffer if getting 'out of scratch space' errors.
        # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables
        '-x', 'dynvarsize=10m',
        #'-x', 'dtrace_global_maxsize=1m',
        '-x', 'evaltime=exec',
        '-o', '/dev/stderr',
        '-n', self._get_dtrace_code(),
      if use_sudo is not False:
        trace_cmd.insert(0, 'sudo')

      with open(self._logname + '.log', 'wb') as logfile:
        self._dtrace = subprocess.Popen(
            trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
      logging.debug('Started dtrace pid: %d' % self._dtrace.pid)

      # Reads until one line is printed, which signifies dtrace is up and ready.
      with open(self._logname + '.log', 'rb') as logfile:
        while 'dtrace_BEGIN' not in logfile.readline():
          if self._dtrace.poll() is not None:
            # Do a busy wait. :/
      logging.debug('dtrace started')
    def _get_dtrace_code(self):
      """Setups the D code to implement child process tracking.

      Injects the cookie in the script so it knows when to stop.

      The script will detect any instance of the script created with
      create_subprocess_thunk() and will start tracing it.
      """
      # NOTE(review): the `out = (` opener, the os.getpid() argument and the
      # final `return out` are missing from this excerpt.
          'inline int PID = %d;\n'
          'inline string SCRIPT = "%s";\n'
          'inline int FILE_ID = %d;\n'
          self._signal_script,
          self._dummy_file_id,
      if os.environ.get('TRACE_INPUTS_DTRACE_ENABLE_EXECVE') == '1':
        # Do not enable by default since it tends to spew dtrace: error lines
        # because the execve() parameters are not in valid memory at the time of
        # TODO(maruel): Find a way to make this reliable since it's useful but
        # only works in limited/trivial uses cases for now.
        out += self.D_CODE_EXECVE
    def trace(self, cmd, cwd, tracename, output):
      """Runs dtrace on an executable.

      This dtruss is broken when it starts the process itself or when tracing
      child processes, this code starts a wrapper process
      generated with create_subprocess_thunk() which starts the executable to
      trace.
      """
      # NOTE(review): several lines of this method (child_cmd construction,
      # the appended trace dict entries) are missing from this excerpt.
      logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
      assert os.path.isabs(cmd[0]), cmd[0]
      assert os.path.isabs(cwd), cwd
      assert os.path.normpath(cwd) == cwd, cwd
      if not self._initialized:
        raise TracingFailure(
            'Called Tracer.trace() on an unitialized object',
            None, None, None, tracename)
      assert tracename not in (i['trace'] for i in self._traces)

      # Starts the script wrapper to start the child process. This signals the
      # dtrace script that this process is to be traced.
      stdout = stderr = None
        stdout = subprocess.PIPE
        stderr = subprocess.STDOUT
          self._signal_script,
      # Call a dummy function so that dtrace knows I'm about to launch a process
      # that needs to be traced.
      child = subprocess.Popen(
          child_cmd + tools.fix_python_path(cmd),
          stdin=subprocess.PIPE,
      logging.debug('Started child pid: %d' % child.pid)

      out = child.communicate()[0]
      # This doesn't mean tracing is done, one of the grand-child process may
      # still be alive. It will be tracked with the dtrace script.
      assert tracename not in (i['trace'] for i in self._traces)
      self._traces.append(
      return child.returncode, out
    def close(self, timeout=None):
      """Terminates dtrace."""
      # NOTE(review): the try/finally skeleton, the `start = time.time()`
      # assignment and the kill/sleep lines are missing from this excerpt.
      logging.debug('close(%s)' % timeout)
        super(Dtrace.Tracer, self).close(timeout)
        # Signal dtrace that it should stop now.
        # ftruncate doesn't exist on Windows.
        os.ftruncate(self._dummy_file_id, 0)  # pylint: disable=E1101
          # Busy-wait for dtrace to exit on its own before forcing the issue.
          while (self._dtrace.poll() is None and
                 (time.time() - start) < timeout):
          # Make sure to kill it in any case.
          if self._dtrace.poll() is None:
        if self._dtrace.returncode != 0:
          # Warn about any dtrace failure but basically ignore it.
          print 'dtrace failure: %s' % self._dtrace.returncode
      os.close(self._dummy_file_id)
      os.remove(self._dummy_file_name)
def post_process_log(self):
  """Sorts the log back in order when each call occurred.

  dtrace doesn't save the buffer in strict order since it keeps one buffer
  per CPU.
  """
  super(Dtrace.Tracer, self).post_process_log()
  logname = self._logname + '.log'
  with open(logname, 'rb') as logfile:
    lines = [l for l in logfile if l.strip()]
  # Any line emitted by dtrace itself (not by the probes) is an error.
  errors = [l for l in lines if l.startswith('dtrace:')]
  # NOTE(review): the guard around this raise is missing from this listing.
    raise TracingFailure(
        'Found errors in the trace: %s' % '\n'.join(errors),
        None, None, None, logname)
  # Sort by the leading numeric token on each line.
  # NOTE(review): the try/except wrapper around the sort is missing from
  # this listing.
  lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
    raise TracingFailure(
        'Found errors in the trace: %s' % '\n'.join(
            l for l in lines if l.split(' ', 1)[0].isdigit()),
        None, None, None, logname)
  # Rewrite the log file, now sorted.
  with open(logname, 'wb') as logfile:
    logfile.write(''.join(lines))
def __init__(self, use_sudo=None):
  """API for tracing with dtrace.

  Arguments:
  - use_sudo: Tri-state forwarded to each Tracer created by get_tracer();
    None leaves the decision to the tracer's own default.
  """
  super(Dtrace, self).__init__()
  self.use_sudo = use_sudo
def get_tracer(self, logname):
  """Creates a Tracer bound to |logname| and this API's sudo setting."""
  tracer_class = self.Tracer
  return tracer_class(logname, self.use_sudo)
def clean_trace(logname):
  """Removes the trace file and its companion '.log' file when present."""
  for path in (logname, logname + '.log'):
    if os.path.isfile(path):
      os.remove(path)
def parse_log(cls, logname, blacklist, trace_name):
  """Parses the dtrace log into one result dict per trace.

  NOTE(review): several lines are missing from this listing — the result
  dict literal, the enclosing try block and the final return.
  """
  logging.info('parse_log(%s, ..., %s)', logname, trace_name)
  assert os.path.isabs(logname)

  def blacklist_more(filepath):
    # All the HFS metadata is in the form /.vol/...
    return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath)

  data = tools.read_json(logname)
  for item in data['traces']:
    # Skip traces not matching the requested name, when one is given.
    if trace_name and item['trace'] != trace_name:
        'output': item['output'],
        'trace': item['trace'],
      context = cls.Context(blacklist_more, item['pid'], item['cwd'])
      # It's fine to assume the file as UTF-8: OSX enforces the file names to
      # be valid UTF-8 and we control the log output.
      for line in codecs.open(logname + '.log', 'rb', encoding='utf-8'):
        context.on_line(line)
      result['results'] = context.to_results()
    except TracingFailure:
      # Record the failure in the result instead of aborting other traces.
      result['exception'] = sys.exc_info()
class LogmanTrace(ApiBase):
  """Uses the native Windows ETW based tracing functionality to trace a child
  process.

  Caveat: this implementation doesn't track cwd or initial_cwd. It is because
  the Windows Kernel doesn't have a concept of 'current working directory' at
  all. A Win32 process has a map of current directories, one per drive letter
  and it is managed by the user mode kernel32.dll. In kernel, a file is always
  opened relative to another file_object or as an absolute path. All the current
  working directory logic is done in user mode.
  """
class Context(ApiBase.Context):
  """Processes a ETW log line and keeps the list of existent and non
  existent files accessed.

  Ignores directories.
  """
  # These indexes are for the stripped version in json.
  # NOTE(review): the column-index constants themselves are missing from
  # this listing.
class Process(ApiBase.Context.Process):
  """Per-process state tracked while replaying the ETW log."""

  def __init__(self, *args):
    super(LogmanTrace.Context.Process, self).__init__(*args)
    # Handle file objects that succeeded. Maps the ETW file_object id to a
    # (filepath, access_type) tuple stashed by handle_FileIo_Create.
    self.file_objects = {}
def __init__(self, blacklist, thunk_pid, trace_name, thunk_cmd):
  # NOTE(review): the logging.debug( opener is missing from this listing.
      '%s(%d, %s, %s)', self.__class__.__name__, thunk_pid, trace_name,
  super(LogmanTrace.Context, self).__init__(blacklist)
  # Maps NT device paths back to DOS drive letters.
  self._drive_map = file_path.DosDriveMap()
  # Threads mapping to the corresponding process id.
  self._threads_active = {}
  # Process ID of the tracer, e.g. the temporary script created by
  # create_subprocess_thunk(). This is tricky because the process id may
  # be reused by Windows.
  self._thunk_pid = thunk_pid
  self._thunk_cmd = thunk_cmd
  self._trace_name = trace_name
  # Current line in the log, used to annotate TracingFailure exceptions.
  self._line_number = 0
  self._thunk_process = None
def on_line(self, line):
  """Processes a json Event line."""
  self._line_number += 1
  # Dispatch to the handle_<EventName>_<Type> method for this line.
  # NOTE(review): the try block and the getattr() lookup are missing from
  # this listing.
      'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),
    raise TracingFailure(
        'Unexpected event %s_%s' % (
          line[self.EVENT_NAME], line[self.TYPE]),
  except TracingFailure, e:
    # Hack in the values since the handler could be a static function.
    e.pid = line[self.PID]
    e.line_number = self._line_number
    # Re-raise the modified exception.
  except (KeyError, NotImplementedError, ValueError), e:
    # Wrap unexpected parser errors so callers only have to catch
    # TracingFailure.
    raise TracingFailure(
        'Trace generated a %s exception: %s' % (
            e.__class__.__name__, str(e)),
def to_results(self):
  """Returns a Results instance built from the detected root process."""
  if not self.root_process:
    raise TracingFailure(
        'Failed to detect the initial process %d' % self._thunk_pid,
  process = self.root_process.to_results_process()
  return Results(process)
def _thread_to_process(self, tid):
  """Finds the process from the thread id.

  NOTE(review): the branch body for unknown/ignored threads is missing from
  this listing.
  """
  pid = self._threads_active.get(tid)
  if not pid or not self._process_lookup.get(pid):
  return self._process_lookup[pid]
def handle_EventTrace_Header(cls, line):
  """Verifies no event was dropped, e.g. no buffer overrun occured."""
  # Layout of this event's USER_DATA columns; only the two offsets actually
  # read are left uncommented.
  BUFFER_SIZE = cls.USER_DATA
  #VERSION = cls.USER_DATA + 1
  #PROVIDER_VERSION = cls.USER_DATA + 2
  #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3
  #END_TIME = cls.USER_DATA + 4
  #TIMER_RESOLUTION = cls.USER_DATA + 5
  #MAX_FILE_SIZE = cls.USER_DATA + 6
  #LOG_FILE_MODE = cls.USER_DATA + 7
  #BUFFERS_WRITTEN = cls.USER_DATA + 8
  #START_BUFFERS = cls.USER_DATA + 9
  #POINTER_SIZE = cls.USER_DATA + 10
  EVENTS_LOST = cls.USER_DATA + 11
  #CPU_SPEED = cls.USER_DATA + 12
  #LOGGER_NAME = cls.USER_DATA + 13
  #LOG_FILE_NAME = cls.USER_DATA + 14
  #BOOT_TIME = cls.USER_DATA + 15
  #PERF_FREQ = cls.USER_DATA + 16
  #START_TIME = cls.USER_DATA + 17
  #RESERVED_FLAGS = cls.USER_DATA + 18
  #BUFFERS_LOST = cls.USER_DATA + 19
  #SESSION_NAME_STRING = cls.USER_DATA + 20
  #LOG_FILE_NAME_STRING = cls.USER_DATA + 21
  if line[EVENTS_LOST] != '0':
    # NOTE(review): the trailing arguments of this raise are missing from
    # this listing.
    raise TracingFailure(
        ( '%s events were lost during trace, please increase the buffer '
          'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),
def handle_FileIo_Cleanup(self, line):
  """General wisdom: if a file is closed, it's because it was opened.

  Note that FileIo_Close is not used since if a file was opened properly but
  not closed before the process exits, only Cleanup will be logged.
  """
  #IRP = self.USER_DATA
  TTID = self.USER_DATA + 1  # Thread ID, that's what we want.
  FILE_OBJECT = self.USER_DATA + 2
  #FILE_KEY = self.USER_DATA + 3
  proc = self._thread_to_process(line[TTID])
  # NOTE(review): the early-return guard is missing from this listing.
    # Not a process we care about.
  file_object = line[FILE_OBJECT]
  if file_object in proc.file_objects:
    # The Create event stashed (filepath, access_type); record the access
    # now that the file object is confirmed cleaned up.
    filepath, access_type = proc.file_objects.pop(file_object)
    proc.add_file(filepath, access_type)
def handle_FileIo_Create(self, line):
  """Handles a file open.

  All FileIo events are described at
  http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx
  for some value of 'description'.

  " (..) process and thread id values of the IO events (..) are not valid "
  http://msdn.microsoft.com/magazine/ee358703.aspx

  The FileIo.Create event doesn't return if the CreateFile() call
  succeeded, so keep track of the file_object and check that it is
  eventually closed with FileIo_Cleanup.
  """
  #IRP = self.USER_DATA
  TTID = self.USER_DATA + 1  # Thread ID, that's what we want.
  FILE_OBJECT = self.USER_DATA + 2
  #CREATE_OPTIONS = self.USER_DATA + 3
  #FILE_ATTRIBUTES = self.USER_DATA + 4
  #SHARE_ACCESS = self.USER_DATA + 5
  OPEN_PATH = self.USER_DATA + 6

  proc = self._thread_to_process(line[TTID])
  # NOTE(review): early-return guards after the lookups below are missing
  # from this listing.
    # Not a process we care about.

  raw_path = line[OPEN_PATH]
  # Ignore directories and bare drive right away.
  if raw_path.endswith(os.path.sep):
  filepath = self._drive_map.to_win32(raw_path)
  # Ignore bare drive right away. Some may still fall through with format
  if len(filepath) == 2:
  file_object = line[FILE_OBJECT]
  if os.path.isdir(filepath):
    # There is no O_DIRECTORY equivalent on Windows. The closed is
    # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So
    # simply discard directories are they are found.
  # Override any stale file object.
  # TODO(maruel): Figure out a way to detect if the file was opened for
  # reading or writting. Sadly CREATE_OPTIONS doesn't seem to be of any help
  # here. For now mark as None to make it clear we have no idea what it is
  proc.file_objects[file_object] = (filepath, None)
def handle_FileIo_Rename(self, line):
  """Apparently unhandled: renames are not folded into the results.

  NOTE(review): the body (likely a bare 'pass') is missing from this
  listing.
  """
  # TODO(maruel): Handle?
def handle_Process_End(self, line):
  """Marks a tracked process as terminated."""
  pid = line[self.PID]
  if self._process_lookup.get(pid):
    logging.info('Terminated: %d' % pid)
    self._process_lookup[pid] = None
    # NOTE(review): an 'else:' line appears to be missing from this listing
    # before the debug call below.
    logging.debug('Terminated: %d' % pid)
  # Forget the thunk process once it exits.
  if self._thunk_process and self._thunk_process.pid == pid:
    self._thunk_process = None
def handle_Process_Start(self, line):
  """Handles a new child process started by PID.

  NOTE(review): several lines are missing from this listing — the
  logging.debug( openers, the early returns, the final 'else:' branch and
  the trailing raise arguments.
  """
  #UNIQUE_PROCESS_KEY = self.USER_DATA
  PROCESS_ID = self.USER_DATA + 1
  #PARENT_PID = self.USER_DATA + 2
  #SESSION_ID = self.USER_DATA + 3
  #EXIT_STATUS = self.USER_DATA + 4
  #DIRECTORY_TABLE_BASE = self.USER_DATA + 5
  #USER_SID = self.USER_DATA + 6
  IMAGE_FILE_NAME = self.USER_DATA + 7
  COMMAND_LINE = self.USER_DATA + 8

  ppid = line[self.PID]
  # ETW encodes the new process id in hexadecimal.
  pid = int(line[PROCESS_ID], 16)
  command_line = CommandLineToArgvW(line[COMMAND_LINE])
      'New process %d->%d (%s) %s' %
      (ppid, pid, line[IMAGE_FILE_NAME], command_line))

  if pid == self._thunk_pid:
    # Need to ignore processes we don't know about because the log is
    # system-wide. self._thunk_pid shall start only one process.
    # This is tricky though because Windows *loves* to reuse process id and
    # it happens often that the process ID of the thunk script created by
    # create_subprocess_thunk() is reused. So just detecting the pid here is
    # not sufficient, we must confirm the command line.
    if command_line[:len(self._thunk_cmd)] != self._thunk_cmd:
          'Ignoring duplicate pid %d for %s: %s while searching for %s',
          pid, self._trace_name, command_line, self._thunk_cmd)
    # TODO(maruel): The check is quite weak. Add the thunk path.
    if self._thunk_process:
      raise TracingFailure(
          ( 'Parent process is _thunk_pid(%d) but thunk_process(%d) is '
            'already set') % (self._thunk_pid, self._thunk_process.pid),
    proc = self.Process(self.blacklist, pid, None)
    self._thunk_process = proc
  elif ppid == self._thunk_pid and self._thunk_process:
    # The direct child of the thunk is the actual root traced process.
    proc = self.Process(self.blacklist, pid, None)
    self.root_process = proc
  elif self._process_lookup.get(ppid):
    proc = self.Process(self.blacklist, pid, None)
    self._process_lookup[ppid].children.append(proc)
  self._process_lookup[pid] = proc

  proc.command = command_line
  proc.executable = line[IMAGE_FILE_NAME]
  # proc.command[0] may be the absolute path of 'executable' but it may be
  # anything else too. If it happens that command[0] ends with executable,
  # use it, otherwise defaults to the base name.
  cmd0 = proc.command[0].lower()
  if not cmd0.endswith('.exe'):
    # TODO(maruel): That's not strictly true either.
  if cmd0.endswith(proc.executable) and os.path.isfile(cmd0):
    # Normalize the path before resolving its canonical case.
    cmd0 = cmd0.replace('/', os.path.sep)
    cmd0 = os.path.normpath(cmd0)
    proc.executable = file_path.get_native_path_case(cmd0)
      'New child: %s -> %d %s' % (ppid, pid, proc.executable))
def handle_Thread_End(self, line):
  """Has the same parameters as Thread_Start.

  Drops the thread from the active map; unknown thread ids are ignored.
  """
  self._threads_active.pop(int(line[self.TID], 16), None)
def handle_Thread_Start(self, line):
  """Handles a new thread created.

  Do not use self.PID here since a process' initial thread is created by
  the parent process.
  """
  # Layout of this event's USER_DATA columns; only the two offsets actually
  # read are left uncommented.
  PROCESS_ID = self.USER_DATA
  TTHREAD_ID = self.USER_DATA + 1
  #STACK_BASE = self.USER_DATA + 2
  #STACK_LIMIT = self.USER_DATA + 3
  #USER_STACK_BASE = self.USER_DATA + 4
  #USER_STACK_LIMIT = self.USER_DATA + 5
  #AFFINITY = self.USER_DATA + 6
  #WIN32_START_ADDR = self.USER_DATA + 7
  #TEB_BASE = self.USER_DATA + 8
  #SUB_PROCESS_TAG = self.USER_DATA + 9
  #BASE_PRIORITY = self.USER_DATA + 10
  #PAGE_PRIORITY = self.USER_DATA + 11
  #IO_PRIORITY = self.USER_DATA + 12
  #THREAD_FLAGS = self.USER_DATA + 13
  # Do not use self.PID here since a process' initial thread is created by
  # the parent process.
  pid = int(line[PROCESS_ID], 16)
  tid = int(line[TTHREAD_ID], 16)
  logging.debug('New thread pid:%d, tid:%d' % (pid, tid))
  self._threads_active[tid] = pid
def supported_events(cls):
  """Returns all the processed events.

  Enumerates the handle_<Event>_<Type> methods defined on the class.

  NOTE(review): the accumulator initialization, the match guard and the
  return are missing from this listing.
  """
  for member in dir(cls):
    match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
      out.append(match.groups())
class Tracer(ApiBase.Tracer):
  # The basic headers.
  # NOTE(review): most of the EXPECTED_HEADER list literal is missing from
  # this listing; only a few column names survive.
      u'Processor Number',
      u'Parent Instance ID',
      u'Related Activity ID',  # 15
      u'Kernel(ms)',  # Both have a resolution of ~15ms which makes them
      u'User(ms)',  # pretty much useless.
      u'User Data',  # Extra arguments that are event-specific.
  # Only the useful headers common to all entries are listed there. Any column
  # at 19 or higher is dependent on the specific event.
  # GUID used by tracerpt for 'no activity'.
  NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
class CsvReader(object):
  """CSV reader that reads files generated by tracerpt.exe.

  csv.reader() fails to read them properly, it mangles file names quoted
  with "" with a comma in it.
  """
  # Hand-rolled state machine states. The enclosing tuple-assignment opener
  # is missing from this listing.
  # 0. Had a ',' or one of the following ' ' after a comma, next should
  # be ' ', '"' or string or ',' for an empty field.
  # 1. Processing an unquoted field up to ','.
  # 2. Processing a new field starting with '"'.
  STARTING_STR_QUOTED,
  # 3. Second quote in a row at the start of a field. It could be either
  # '""foo""' or '""'. Who the hell thought it was a great idea to use
  # the same character for delimiting and escaping?
  STARTING_SECOND_QUOTE,
  # 4. A quote inside a quoted string where the previous character was
  # not a quote, so the string is not empty. Can be either: end of a
  # quoted string (a delimiter) or a quote escape. The next char must be
  # either '"' or ','.
  HAD_QUOTE_IN_QUOTED,
  # 5. Second quote inside a quoted string.
  HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED,
  # 6. Processing a field that started with '"'.
  IN_STR_QUOTED) = range(7)

  def __init__(self, f):
    # NOTE(review): the __init__ body and the following 'def next(self):'
    # line are missing from this listing; the docstring below documents the
    # next() iterator method, not __init__.
    """Splits the line in fields."""
    line = self.f.readline()
      raise StopIteration()
    state = self.HAD_DELIMITER
    for i, c in enumerate(line):
      if state == self.HAD_DELIMITER:
        # Ignore initial whitespaces
          state = self.STARTING_STR_QUOTED
        # Start of a new field.
      elif state == self.IN_STR:
        # Do not accept quote inside unquoted field.
        assert c != '"', (i, c, line, fields)
          fields[-1] = fields[-1].strip()
          state = self.HAD_DELIMITER
          fields[-1] = fields[-1] + c
      elif state == self.STARTING_STR_QUOTED:
          # Do not store the character yet.
          state = self.STARTING_SECOND_QUOTE
          state = self.IN_STR_QUOTED
          fields[-1] = fields[-1] + c
      elif state == self.STARTING_SECOND_QUOTE:
          # It was an empty field. '""' == ''.
          state = self.HAD_DELIMITER
          fields[-1] = fields[-1] + '"' + c
          state = self.IN_STR_QUOTED
      elif state == self.HAD_QUOTE_IN_QUOTED:
          # End of the string.
          state = self.HAD_DELIMITER
          state = self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED
          # The previous double-quote was just an unescaped quote.
          fields[-1] = fields[-1] + '"' + c
          state = self.IN_STR_QUOTED
      elif state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED:
          # End of the string.
          state = self.HAD_DELIMITER
          fields[-1] = fields[-1] + '"'
          # That's just how the logger rolls. Revert back to appending the
          # char and "guess" it was a quote in a double-quoted string.
          state = self.IN_STR_QUOTED
          fields[-1] = fields[-1] + '"' + c
      elif state == self.IN_STR_QUOTED:
          # Could be a delimiter or an escape.
          state = self.HAD_QUOTE_IN_QUOTED
          fields[-1] = fields[-1] + c
    # Flush the pending quote at end-of-line.
    if state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED:
      fields[-1] = fields[-1] + '"'
    # Final-state sanity check; the assert opener is missing from this
    # listing.
      # Terminated with a normal field.
      # Terminated with an empty field.
      self.STARTING_SECOND_QUOTE,
      # Terminated with a normal quoted field.
      self.HAD_QUOTE_IN_QUOTED), (
          line, state, fields)
def __init__(self, logname):
  """Starts the log collection.

  Requires administrative access. logman.exe is synchronous so no need for a
  "warmup" call. 'Windows Kernel Trace' is *localized* so use its GUID
  instead. The GUID constant name is SystemTraceControlGuid. Lovely.

  One can get the list of potentially interesting providers with:
  "logman query providers | findstr /i file"
  """
  super(LogmanTrace.Tracer, self).__init__(logname)
  self._signal_script = create_subprocess_thunk()
  self._scripts_to_cleanup.append(self._signal_script)
  # NOTE(review): the cmd_start list opener and several of its entries are
  # missing from this listing.
      '-p', '{9e814aad-3204-11d2-9a82-006008a86939}',
      # splitio,fileiocompletion,syscall,file,cswitch,img
      '(process,fileio,thread)',
      '-o', self._logname + '.etl',
      '-ets', # Send directly to kernel
      # Values extracted out of thin air.
      # Event Trace Session buffer size in kb.
      # Number of Event Trace Session buffers.
  logging.debug('Running: %s' % cmd_start)
  # NOTE(review): the try: opener is missing from this listing.
    subprocess.check_call(
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)
  except subprocess.CalledProcessError, e:
    # Translate the two most common logman failures into actionable hints.
    if e.returncode == -2147024891:
      print >> sys.stderr, 'Please restart with an elevated admin prompt'
    elif e.returncode == -2144337737:
      print >> sys.stderr, (
          'A kernel trace was already running, stop it and try again')
def trace(self, cmd, cwd, tracename, output):
  logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
  # Preconditions: absolute executable path, absolute and normalized cwd.
  assert os.path.isabs(cmd[0]), cmd[0]
  assert os.path.isabs(cwd), cwd
  assert os.path.normpath(cwd) == cwd, cwd
  if not self._initialized:
    raise TracingFailure(
        'Called Tracer.trace() on an unitialized object',
        None, None, None, tracename)
  assert tracename not in (i['trace'] for i in self._traces)

  # Use "logman -?" for help.

  stdout = stderr = None
  # NOTE(review): the conditional guarding the PIPE redirection is missing
  # from this listing.
    stdout = subprocess.PIPE
    stderr = subprocess.STDOUT

  # Run the child process.
  logging.debug('Running: %s' % cmd)
  # Use the temporary script generated with create_subprocess_thunk() so we
  # have a clear pid owner. Since trace_inputs.py can be used as a library
  # and could trace multiple processes simultaneously, it makes it more
  # complex if the executable to be traced is executed directly here. It
  # also solves issues related to logman.exe that needs to be executed to
  # control the kernel trace.
  # NOTE(review): the child_cmd list opener is missing from this listing.
      self._signal_script,
  child = subprocess.Popen(
      child_cmd + tools.fix_python_path(cmd),
      stdin=subprocess.PIPE,
  logging.debug('Started child pid: %d' % child.pid)
  out = child.communicate()[0]
  # This doesn't mean all the grand-children are done. Sadly, we don't have
  # a good way to determine that.

  assert tracename not in (i['trace'] for i in self._traces)
  self._traces.append({
    # Used to figure out the real process when process ids are reused.
    'thunk_cmd': child_cmd,
  return child.returncode, out
def close(self, _timeout=None):
  """Stops the kernel log collection and converts the traces to text
  representation.
  """
  super(LogmanTrace.Tracer, self).close()
  # NOTE(review): the cmd_stop list construction is mostly missing from
  # this listing.
      '-ets', # Sends the command directly to the kernel.
  logging.debug('Running: %s' % cmd_stop)
  subprocess.check_call(
      stdin=subprocess.PIPE,
      stdout=subprocess.PIPE,
      stderr=subprocess.STDOUT)
def post_process_log(self):
  """Converts the .etl file into .csv then into .json."""
  super(LogmanTrace.Tracer, self).post_process_log()
  # NOTE(review): the line selecting |logformat| is missing from this
  # listing.
  self._convert_log(logformat)
  self._trim_log(logformat)
def _gen_logdata(self):
  """Builds the json data describing the traces.

  NOTE(review): the body is mostly missing from this listing; only one dict
  entry is visible.
  """
    'traces': self._traces,
def _trim_log(self, logformat):
  """Reduces the amount of data in original log by generating a 'reduced'
  log.
  """
  # Open the converted log with the decoding matching its format.
  if logformat == 'csv_utf16':
    file_handle = codecs.open(
        self._logname + '.' + logformat, 'r', encoding='utf-16')

  elif logformat == 'csv':
    assert sys.getfilesystemencoding() == 'mbcs'
    file_handle = codecs.open(
        self._logname + '.' + logformat, 'r',
        encoding=get_current_encoding())

  supported_events = LogmanTrace.Context.supported_events()

  def trim(generator):
    """Loads items from the generator and returns the interesting data.

    It filters out any uninteresting line and reduce the amount of data in
    the trace.
    """
    for index, line in enumerate(generator):
      # NOTE(review): the guard selecting the header line is missing from
      # this listing.
      if line != self.EXPECTED_HEADER:
        raise TracingFailure(
            'Found malformed header: %s' % line,
      # As you can see, the CSV is full of useful non-redundant information:
      if (line[2] != '0' or  # Event ID
          line[3] not in ('2', '3') or  # Version
          line[4] != '0' or  # Channel
          line[5] != '0' or  # Level
          line[7] != '0' or  # Task
          line[8] != '0x0000000000000000' or  # Keyword
          line[12] != '' or  # Instance ID
          line[13] != '' or  # Parent Instance ID
          line[14] != self.NULL_GUID or  # Activity ID
          line[15] != ''):  # Related Activity ID
        raise TracingFailure(
            'Found unexpected values in line: %s' % ' '.join(line),
      # Only keep events a Context handler exists for.
      if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
          line[self.EVENT_NAME],
          line[self.PROCESSOR_ID],
          line[self.TIMESTAMP],
      ] + line[self.USER_DATA:]

  # must not convert the trim() call into a list, since it will use too much
  # memory for large trace. use a csv file as a workaround since the json
  # parser requires a complete in-memory file.
  with open('%s.preprocessed' % self._logname, 'wb') as f:
    # $ and * can't be used in file name on windows, reducing the likelihood
    # of having to escape a string.
    # NOTE(review): the csv.writer( opener is missing from this listing.
        f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL)
    for line in trim(self.CsvReader(file_handle)):
      out.writerow([s.encode('utf-8') for s in line])
def _convert_log(self, logformat):
  """Converts the ETL trace to text representation.

  Normally, 'csv' is sufficient. If complex scripts are used (like eastern
  languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.

  Arguments:
    - logformat: Text format to be generated, csv, csv_utf16 or xml.

  Use "tracerpt -?" for help.
  """
  # Force a locale-independent run so tracerpt output stays parseable.
  LOCALE_INVARIANT = 0x7F
  windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
  # NOTE(review): the cmd_convert list opener is missing from this listing.
      '-l', self._logname + '.etl',
      '-o', self._logname + '.' + logformat,
  # Use -of XML to get the header of each items after column 19, e.g. all
  # the actual headers of 'User Data'.
  if logformat == 'csv':
    # tracerpt localizes the 'Type' column, for major brainfuck
    # entertainment. I can't imagine any sane reason to do that.
    cmd_convert.extend(['-of', 'CSV'])
  elif logformat == 'csv_utf16':
    # This causes it to use UTF-16, which doubles the log size but ensures
    # the log is readable for non-ASCII characters.
    cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
  elif logformat == 'xml':
    cmd_convert.extend(['-of', 'XML'])
    raise ValueError('Unexpected log format \'%s\'' % logformat)
  logging.debug('Running: %s' % cmd_convert)
  # This can takes tens of minutes for large logs.
  # Redirects all output to stderr.
  subprocess.check_call(
      stdin=subprocess.PIPE,
def __init__(self, use_sudo=False):  # pylint: disable=W0613
  """Windows ETW-based tracing API."""
  super(LogmanTrace, self).__init__()
  # Ignore use_sudo. It's irrelevant on Windows but kept to simplify the API.
def clean_trace(logname):
  """Deletes the trace json and every artifact derived from the .etl log."""
  for suffix in ('', '.csv', '.etl', '.json', '.xml', '.preprocessed'):
    candidate = logname + suffix
    if os.path.isfile(candidate):
      os.remove(candidate)
def parse_log(cls, logname, blacklist, trace_name):
  """Parses the trace json and the .preprocessed csv into result dicts.

  NOTE(review): several lines are missing from this listing — the contexes
  list-comprehension opener, the csv.reader( opener and some loop guards.
  """
  logging.info('parse_log(%s, ..., %s)', logname, trace_name)
  assert os.path.isabs(logname)

  def blacklist_more(filepath):
    # All the NTFS metadata is in the form x:\$EXTEND or stuff like that.
    return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)

  # Create a list of (Context, result_dict) tuples. This is necessary because
  # the csv file may be larger than the amount of available memory.
        blacklist_more, item['pid'], item['trace'], item['thunk_cmd']),
        'output': item['output'],
        'trace': item['trace'],
    for item in tools.read_json(logname)['traces']
    if not trace_name or item['trace'] == trace_name

  # The log may be too large to fit in memory and it is not efficient to read
  # it multiple times, so multiplex the contexes instead, which is slightly
  with open('%s.preprocessed' % logname, 'rb') as f:
        f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL)
    for encoded in lines:
      line = [s.decode('utf-8') for s in encoded]
      # Convert the PID in-place from hex.
      line[cls.Context.PID] = int(line[cls.Context.PID], 16)
      for context in contexes:
        # Once a context failed, stop feeding it lines.
        if 'exception' in context[1]:
          context[0].on_line(line)
        except TracingFailure:
          context[1]['exception'] = sys.exc_info()

  for context in contexes:
    if 'exception' in context[1]:
    context[1]['results'] = context[0].to_results()

  return [context[1] for context in contexes]
def get_api(**kwargs):
  """Returns the correct implementation for the current OS."""
  if sys.platform == 'cygwin':
    raise NotImplementedError(
        'Not implemented for cygwin, start the script from Win32 python')
  # NOTE(review): the flavors dict opener and most platform entries are
  # missing from this listing.
      'win32': LogmanTrace,
  # Defaults to strace.
  return flavors.get(sys.platform, Strace)(**kwargs)
def extract_directories(root_dir, files, blacklist):
  """Detects if all the files in a directory are in |files| and if so, replace
  the individual files by a Results.Directory instance.

  Takes a list of Results.File instances and returns a shorter list of
  Results.File and Results.Directory instances.

  Arguments:
    - root_dir: Optional base directory that shouldn't be search further.
    - files: list of Results.File instances.
    - blacklist: lambda to reject unneeded files, for example r'.+\.pyc'.

  NOTE(review): several lines are missing from this listing — the
  logging.debug( opener, the buckets initialization, the while-loop opener
  and a couple of early returns.
  """
      'extract_directories(%s, %d files, ...)' % (root_dir, len(files)))
  assert not (root_dir or '').endswith(os.path.sep), root_dir
  # It is important for root_dir to not be a symlinked path, make sure to call
  # os.path.realpath() as needed.
  assert not root_dir or (
      os.path.realpath(file_path.get_native_path_case(root_dir)) == root_dir)
  assert not any(isinstance(f, Results.Directory) for f in files)
  # Remove non existent files.
  files = [f for f in files if f.existent]
  # All files must share the same root, which can be None.
  assert len(set(f.root for f in files)) == 1, set(f.root for f in files)

  # Creates a {directory: {filename: File}} mapping, up to root.
  buckets[root_dir] = {}
  for fileobj in files:
    path = fileobj.full_path
    directory = os.path.dirname(path)
    # Do not use os.path.basename() so trailing os.path.sep is kept.
    basename = path[len(directory)+1:]
    files_in_directory = buckets.setdefault(directory, {})
    files_in_directory[basename] = fileobj
    # Add all the directories recursively up to root.
      directory = os.path.dirname(directory)
      if directory + os.path.sep == root_dir or directory == old_d:
      buckets.setdefault(directory, {})

  root_prefix = len(root_dir) + 1 if root_dir else 0
  # Process deepest directories first so collapsed children roll up into
  # their parents.
  for directory in sorted(buckets, reverse=True):
    if not os.path.isdir(directory):
          '%s was a directory but doesn\'t exist anymore; ignoring', directory)
    actual = set(f for f in os.listdir(directory) if not blacklist(f))
    expected = set(buckets[directory])
    # Collapse only when every non-blacklisted entry was traced.
    if not (actual - expected):
      parent = os.path.dirname(directory)
      buckets[parent][os.path.basename(directory)] = Results.Directory(
          directory[root_prefix:],
          sum(f.size for f in buckets[directory].itervalues()),
          sum(f.nb_files for f in buckets[directory].itervalues()))
      # Remove the whole bucket.
      del buckets[directory]

  # Reverse the mapping with what remains. The original instances are returned,
  # so the cached meta data is kept.
  files = sum((x.values() for x in buckets.itervalues()), [])
  return sorted(files, key=lambda x: x.path)
def trace(logfile, cmd, cwd, api, output):
  """Traces an executable. Returns (returncode, output) from api.

  Arguments:
  - logfile: file to write to.
  - cmd: command to run.
  - cwd: current directory to start the process in.
  - api: a tracing api instance.
  - output: if True, returns output, otherwise prints it at the console.
  """
  fixed_cmd = tools.fix_python_path(cmd)
  # Drop stale logs from a previous run before starting a fresh trace.
  api.clean_trace(logfile)
  with api.get_tracer(logfile) as tracer:
    return tracer.trace(fixed_cmd, cwd, 'default', output)
def CMDclean(parser, args):
  """Cleans up traces."""
  options, args = parser.parse_args(args)
  # NOTE(review): the api construction and return lines are missing from
  # this listing.
  api.clean_trace(options.log)
def CMDtrace(parser, args):
  """Traces an executable."""
  parser.allow_interspersed_args = False
  # NOTE(review): the parser.add_option( openers and two condition guards
  # are missing from this listing.
      '-q', '--quiet', action='store_true',
      help='Redirects traced executable output to /dev/null')
      '-s', '--sudo', action='store_true',
      help='Use sudo when shelling out the tracer tool (ignored on Windows)')
      '-n', '--no-sudo', action='store_false',
      help='Don\'t use sudo')
  options, args = parser.parse_args(args)

    parser.error('Please provide a command to run')

    parser.error('Please rerun this program with admin privileges')

  # Make a relative executable path absolute when it points at a real
  # executable file.
  if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):
    args[0] = os.path.abspath(args[0])

  # options.sudo default value is None, which is to do whatever tracer defaults
  api = get_api(use_sudo=options.sudo)
  return trace(options.log, args, os.getcwd(), api, options.quiet)[0]
def CMDread(parser, args):
  """Reads the logs and prints the result.

  NOTE(review): many lines are missing from this listing — the option
  group/add_option openers, the api construction, the try: opener, loop
  openers and the return statements.
  """
      metavar='VAR_NAME directory',
      help=('Variables to replace relative directories against. Example: '
            '"-v \'$HOME\' \'/home/%s\'" will replace all occurence of your '
            'home dir with $HOME') % getpass.getuser())
      help='Root directory to base everything off it. Anything outside of this '
           'this directory will not be reported')
      help='Only reads one of the trace. Defaults to reading all traces')
      '-j', '--json', action='store_true',
      help='Outputs raw result data as json')
      '--trace-blacklist', action='append', default=[],
      help='List of regexp to use as blacklist filter')
  options, args = parser.parse_args(args)

  # Resolve root_dir to its canonical on-disk case.
  if options.root_dir:
    options.root_dir = file_path.get_native_path_case(
        unicode(os.path.abspath(options.root_dir)))

  variables = dict(options.variables)
  blacklist = tools.gen_blacklist(options.trace_blacklist)
  data = api.parse_log(options.log, blacklist, options.trace_name)
  # Process each trace.
    if 'exception' in item:
      # Do not abort the other traces.
      print >> sys.stderr, (
          'Trace %s: Got an exception: %s' % (
            item['trace'], item['exception'][1]))
    results = item['results']
    if options.root_dir:
      results = results.strip_root(options.root_dir)

      output_as_json.append(results.flatten())

      simplified = extract_directories(
          options.root_dir, results.files, blacklist)
      simplified = [f.replace_variables(variables) for f in simplified]
      print('Trace: %s' % item['trace'])
      print('Total: %d' % len(results.files))
      print('Non existent: %d' % len(results.non_existent))
      for f in results.non_existent:
        print(' %s' % f.path)
          'Interesting: %d reduced to %d' % (
              len(results.existent), len(simplified)))
      for f in simplified:
        print(' %s' % f.path)

    tools.write_json(sys.stdout, output_as_json, False)
  except KeyboardInterrupt:
  except IOError as e:
    if e.errno == errno.EPIPE:
      # Do not print a stack trace when the output is piped to less and the user
      # quits before the whole output was written.
# NOTE(review): numbered excerpt; the self.add_option( opener (~3366) and the
# `if not options.log:` guard (~3376) are among the missing lines.
3356 class OptionParserTraceInputs(tools.OptionParserWithLogging):
3357 """Adds automatic --log handling."""
3359 # Disable --log-file options since both --log and --log-file options are
3361 # TODO(vadimsh): Rename --log-file or --log to something else.
3362 enable_log_file = False
# __init__ chains to the base parser and registers the mandatory -l/--log
# option (registration call opener not visible in this excerpt).
3364 def __init__(self, **kwargs):
3365 tools.OptionParserWithLogging.__init__(self, **kwargs)
3367 '-l', '--log', help='Log file to generate or read, required')
# parse_args enforces that --log was supplied and normalizes it to an
# absolute path before returning, so every subcommand sees a usable path.
3369 def parse_args(self, *args, **kwargs):
3370 """Makes sure the paths make sense.
3372 On Windows, / and \ are often mixed together in a path.
3374 options, args = tools.OptionParserWithLogging.parse_args(
3375 self, *args, **kwargs)
3377 self.error('Must supply a log file with -l')
3378 options.log = os.path.abspath(options.log)
3379 return options, args
# NOTE(review): tail of main() -- its def line, the enclosing try:, and the
# trailing `return 1` (original lines ~3381-3384, 3391) are missing from this
# numbered excerpt.
# Dispatch the CMD* subcommands defined in this module; TracingFailure from
# any of them is reported on stderr instead of crashing with a traceback.
3383 dispatcher = subcommand.CommandDispatcher(__name__)
3385 return dispatcher.execute(
3386 OptionParserTraceInputs(version=__version__), argv)
# Python 2 except syntax -- consistent with the rest of this file.
3387 except TracingFailure, e:
3388 sys.stderr.write('\nError: ')
3389 sys.stderr.write(str(e))
3390 sys.stderr.write('\n')
# Script entry point: fix stdout/stderr encoding and disable buffering before
# running, then exit with main()'s return code. (Original line 3397 between
# these -- presumably colorama.init() given the import -- is not shown here.)
3394 if __name__ == '__main__':
3395 fix_encoding.fix_encoding()
3396 tools.disable_buffering()
3398 sys.exit(main(sys.argv[1:]))