4 * Builtin 'trace' command:
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
16 * Released under the GPL v2. (and only v2, not any later version)
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/event.h"
25 #include "util/evlist.h"
26 #include <subcmd/exec-cmd.h>
27 #include "util/machine.h"
28 #include "util/path.h"
29 #include "util/session.h"
30 #include "util/thread.h"
31 #include <subcmd/parse-options.h>
32 #include "util/strlist.h"
33 #include "util/intlist.h"
34 #include "util/thread_map.h"
35 #include "util/stat.h"
36 #include "trace/beauty/beauty.h"
37 #include "trace-event.h"
38 #include "util/parse-events.h"
39 #include "util/bpf-loader.h"
40 #include "callchain.h"
41 #include "print_binary.h"
43 #include "syscalltbl.h"
44 #include "rb_resort.h"
48 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/audit.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
61 #include "sane_ctype.h"
64 # define O_CLOEXEC 02000000
67 #ifndef F_LINUX_SPECIFIC_BASE
68 # define F_LINUX_SPECIFIC_BASE 1024
/*
 * Per-run state for 'perf trace'.  Partial view: the enclosing
 * 'struct trace {' line, the nested anonymous structs (syscalls,
 * events, filename, stats, ...) and several members are elided here.
 */
72 struct perf_tool tool;
73 struct syscalltbl *sctbl;	/* syscall id <-> name table */
76 struct syscall *table;	/* presumably indexed by syscall id; see trace__read_syscall_info() */
78 struct perf_evsel *sys_enter,
82 struct record_opts opts;
83 struct perf_evlist *evlist;
85 struct thread *current;
88 unsigned long nr_events;
89 struct strlist *ev_qualifier;	/* -e/--expr syscall name list */
98 double duration_filter;	/* ms threshold; see trace__filter_duration() */
104 unsigned int max_stack;
105 unsigned int min_stack;
106 bool not_ev_qualifier;	/* ev_qualifier is an exclusion list */
110 bool multiple_threads;
114 bool show_tool_stats;
116 bool kernel_syscallchains;
126 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
127 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
131 #define TP_UINT_FIELD(bits) \
132 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
135 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
144 #define TP_UINT_FIELD__SWAPPED(bits) \
145 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
148 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
149 return bswap_##bits(value);\
152 TP_UINT_FIELD__SWAPPED(16);
153 TP_UINT_FIELD__SWAPPED(32);
154 TP_UINT_FIELD__SWAPPED(64);
156 static int tp_field__init_uint(struct tp_field *field,
157 struct format_field *format_field,
160 field->offset = format_field->offset;
162 switch (format_field->size) {
164 field->integer = tp_field__u8;
167 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
170 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
173 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
182 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
184 return sample->raw_data + field->offset;
187 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
189 field->offset = format_field->offset;
190 field->pointer = tp_field__ptr;
197 struct tp_field args, ret;
201 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
202 struct tp_field *field,
205 struct format_field *format_field = perf_evsel__field(evsel, name);
207 if (format_field == NULL)
210 return tp_field__init_uint(field, format_field, evsel->needs_swap);
213 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
214 ({ struct syscall_tp *sc = evsel->priv;\
215 perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
217 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
218 struct tp_field *field,
221 struct format_field *format_field = perf_evsel__field(evsel, name);
223 if (format_field == NULL)
226 return tp_field__init_ptr(field, format_field);
229 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
230 ({ struct syscall_tp *sc = evsel->priv;\
231 perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
233 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
236 perf_evsel__delete(evsel);
239 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
241 evsel->priv = malloc(sizeof(struct syscall_tp));
242 if (evsel->priv != NULL) {
243 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
246 evsel->handler = handler;
/*
 * Create an evsel for the raw_syscalls:sys_{enter,exit} tracepoint
 * named by 'direction', falling back to the older "syscalls" subsystem
 * name, then wire up the syscall tracepoint field accessors and the
 * sample handler.  Frees the evsel on init failure.
 * (Partial view: the NULL checks and the out/return lines are elided.)
 */
257 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
259 struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
261 /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
263 evsel = perf_evsel__newtp("syscalls", direction);
268 if (perf_evsel__init_syscall_tp(evsel, handler))
274 perf_evsel__delete_priv(evsel);
278 #define perf_evsel__sc_tp_uint(evsel, name, sample) \
279 ({ struct syscall_tp *fields = evsel->priv; \
280 fields->name.integer(&fields->name, sample); })
282 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \
283 ({ struct syscall_tp *fields = evsel->priv; \
284 fields->name.pointer(&fields->name, sample); })
286 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
288 int idx = val - sa->offset;
290 if (idx < 0 || idx >= sa->nr_entries)
291 return scnprintf(bf, size, intfmt, val);
293 return scnprintf(bf, size, "%s", sa->entries[idx]);
296 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
298 struct syscall_arg *arg)
300 return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
303 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
304 struct syscall_arg *arg)
306 return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
309 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
313 struct strarray **entries;
316 #define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
317 .nr_entries = ARRAY_SIZE(array), \
321 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
322 struct syscall_arg *arg)
324 struct strarrays *sas = arg->parm;
327 for (i = 0; i < sas->nr_entries; ++i) {
328 struct strarray *sa = sas->entries[i];
329 int idx = arg->val - sa->offset;
331 if (idx >= 0 && idx < sa->nr_entries) {
332 if (sa->entries[idx] == NULL)
334 return scnprintf(bf, size, "%s", sa->entries[idx]);
338 return scnprintf(bf, size, "%d", arg->val);
342 #define AT_FDCWD -100
345 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
346 struct syscall_arg *arg)
351 return scnprintf(bf, size, "CWD");
353 return syscall_arg__scnprintf_fd(bf, size, arg);
356 #define SCA_FDAT syscall_arg__scnprintf_fd_at
358 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
359 struct syscall_arg *arg);
361 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
363 size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
365 return scnprintf(bf, size, "%#lx", arg->val);
368 size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
370 return scnprintf(bf, size, "%d", arg->val);
373 size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
375 return scnprintf(bf, size, "%ld", arg->val);
378 static const char *bpf_cmd[] = {
379 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
380 "MAP_GET_NEXT_KEY", "PROG_LOAD",
382 static DEFINE_STRARRAY(bpf_cmd);
384 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
385 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
387 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
388 static DEFINE_STRARRAY(itimers);
390 static const char *keyctl_options[] = {
391 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
392 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
393 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
394 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
395 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
397 static DEFINE_STRARRAY(keyctl_options);
399 static const char *whences[] = { "SET", "CUR", "END",
407 static DEFINE_STRARRAY(whences);
409 static const char *fcntl_cmds[] = {
410 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
411 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
412 "SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
415 static DEFINE_STRARRAY(fcntl_cmds);
417 static const char *fcntl_linux_specific_cmds[] = {
418 "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
419 "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
420 "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
423 static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);
425 static struct strarray *fcntl_cmds_arrays[] = {
426 &strarray__fcntl_cmds,
427 &strarray__fcntl_linux_specific_cmds,
430 static DEFINE_STRARRAYS(fcntl_cmds_arrays);
432 static const char *rlimit_resources[] = {
433 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
434 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
437 static DEFINE_STRARRAY(rlimit_resources);
439 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
440 static DEFINE_STRARRAY(sighow);
442 static const char *clockid[] = {
443 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
444 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
445 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
447 static DEFINE_STRARRAY(clockid);
449 static const char *socket_families[] = {
450 "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
451 "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
452 "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
453 "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
454 "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
455 "ALG", "NFC", "VSOCK",
457 static DEFINE_STRARRAY(socket_families);
459 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
460 struct syscall_arg *arg)
465 if (mode == F_OK) /* 0 */
466 return scnprintf(bf, size, "F");
468 if (mode & n##_OK) { \
469 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
479 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
484 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
486 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
487 struct syscall_arg *arg);
489 #define SCA_FILENAME syscall_arg__scnprintf_filename
491 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
492 struct syscall_arg *arg)
494 int printed = 0, flags = arg->val;
497 if (flags & O_##n) { \
498 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
507 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
512 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
514 #ifndef GRND_NONBLOCK
515 #define GRND_NONBLOCK 0x0001
518 #define GRND_RANDOM 0x0002
521 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
522 struct syscall_arg *arg)
524 int printed = 0, flags = arg->val;
527 if (flags & GRND_##n) { \
528 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
529 flags &= ~GRND_##n; \
537 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
542 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
544 #define STRARRAY(name, array) \
545 { .scnprintf = SCA_STRARRAY, \
546 .parm = &strarray__##array, }
548 #include "trace/beauty/eventfd.c"
549 #include "trace/beauty/flock.c"
550 #include "trace/beauty/futex_op.c"
551 #include "trace/beauty/mmap.c"
552 #include "trace/beauty/mode_t.c"
553 #include "trace/beauty/msg_flags.c"
554 #include "trace/beauty/open_flags.c"
555 #include "trace/beauty/perf_event_open.c"
556 #include "trace/beauty/pid.c"
557 #include "trace/beauty/sched_policy.c"
558 #include "trace/beauty/seccomp.c"
559 #include "trace/beauty/signum.c"
560 #include "trace/beauty/socket_type.c"
561 #include "trace/beauty/waitid_options.c"
563 struct syscall_arg_fmt {
564 size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
570 static struct syscall_fmt {
573 struct syscall_arg_fmt arg[6];
580 .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, },
581 { .name = "arch_prctl", .alias = "prctl", },
583 .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
584 { .name = "brk", .hexret = true,
585 .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
586 { .name = "clock_gettime",
587 .arg = { [0] = STRARRAY(clk_id, clockid), }, },
588 { .name = "clone", .errpid = true, .nr_args = 5,
589 .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, },
590 [1] = { .name = "child_stack", .scnprintf = SCA_HEX, },
591 [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
592 [3] = { .name = "child_tidptr", .scnprintf = SCA_HEX, },
593 [4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, },
595 .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
596 { .name = "epoll_ctl",
597 .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
598 { .name = "eventfd2",
599 .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
600 { .name = "fchmodat",
601 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
602 { .name = "fchownat",
603 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
605 .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
606 .parm = &strarrays__fcntl_cmds_arrays,
607 .show_zero = true, },
608 [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, },
610 .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
611 { .name = "fstat", .alias = "newfstat", },
612 { .name = "fstatat", .alias = "newfstatat", },
614 .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, }, },
615 { .name = "futimesat",
616 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
617 { .name = "getitimer",
618 .arg = { [0] = STRARRAY(which, itimers), }, },
619 { .name = "getpid", .errpid = true, },
620 { .name = "getpgid", .errpid = true, },
621 { .name = "getppid", .errpid = true, },
622 { .name = "getrandom",
623 .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
624 { .name = "getrlimit",
625 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
628 #if defined(__i386__) || defined(__x86_64__)
630 * FIXME: Make this available to all arches.
632 [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
633 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
635 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
638 .arg = { [0] = STRARRAY(option, keyctl_options), }, },
640 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
642 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
644 .arg = { [2] = STRARRAY(whence, whences), }, },
645 { .name = "lstat", .alias = "newlstat", },
647 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
648 [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
650 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
652 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
654 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
655 { .name = "mlockall",
656 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
657 { .name = "mmap", .hexret = true,
658 /* The standard mmap maps to old_mmap on s390x */
659 #if defined(__s390x__)
662 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
663 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
664 [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
665 { .name = "mprotect",
666 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
667 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, },
668 { .name = "mq_unlink",
669 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
670 { .name = "mremap", .hexret = true,
671 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
672 [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
673 [4] = { .scnprintf = SCA_HEX, /* new_addr */ }, }, },
675 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
677 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
678 { .name = "name_to_handle_at",
679 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
680 { .name = "newfstatat",
681 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
683 .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
684 { .name = "open_by_handle_at",
685 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
686 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
688 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
689 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
690 { .name = "perf_event_open",
691 .arg = { [2] = { .scnprintf = SCA_INT, /* cpu */ },
692 [3] = { .scnprintf = SCA_FD, /* group_fd */ },
693 [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
695 .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
696 { .name = "pkey_alloc",
697 .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS, /* access_rights */ }, }, },
698 { .name = "pkey_free",
699 .arg = { [0] = { .scnprintf = SCA_INT, /* key */ }, }, },
700 { .name = "pkey_mprotect",
701 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
702 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
703 [3] = { .scnprintf = SCA_INT, /* pkey */ }, }, },
704 { .name = "poll", .timeout = true, },
705 { .name = "ppoll", .timeout = true, },
706 { .name = "pread", .alias = "pread64", },
707 { .name = "preadv", .alias = "pread", },
708 { .name = "prlimit64",
709 .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
710 { .name = "pwrite", .alias = "pwrite64", },
711 { .name = "readlinkat",
712 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
713 { .name = "recvfrom",
714 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
715 { .name = "recvmmsg",
716 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
718 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
719 { .name = "renameat",
720 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
721 { .name = "rt_sigaction",
722 .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
723 { .name = "rt_sigprocmask",
724 .arg = { [0] = STRARRAY(how, sighow), }, },
725 { .name = "rt_sigqueueinfo",
726 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
727 { .name = "rt_tgsigqueueinfo",
728 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
729 { .name = "sched_setscheduler",
730 .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
732 .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP, /* op */ },
733 [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
734 { .name = "select", .timeout = true, },
735 { .name = "sendmmsg",
736 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
738 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
740 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
741 { .name = "set_tid_address", .errpid = true, },
742 { .name = "setitimer",
743 .arg = { [0] = STRARRAY(which, itimers), }, },
744 { .name = "setrlimit",
745 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
747 .arg = { [0] = STRARRAY(family, socket_families),
748 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
749 { .name = "socketpair",
750 .arg = { [0] = STRARRAY(family, socket_families),
751 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
752 { .name = "stat", .alias = "newstat", },
754 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ },
755 [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
756 [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, },
758 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
760 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
761 { .name = "symlinkat",
762 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
764 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
766 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
767 { .name = "uname", .alias = "newuname", },
768 { .name = "unlinkat",
769 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
770 { .name = "utimensat",
771 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
772 { .name = "wait4", .errpid = true,
773 .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
774 { .name = "waitid", .errpid = true,
775 .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
778 static int syscall_fmt__cmp(const void *name, const void *fmtp)
780 const struct syscall_fmt *fmt = fmtp;
781 return strcmp(name, fmt->name);
784 static struct syscall_fmt *syscall_fmt__find(const char *name)
786 const int nmemb = ARRAY_SIZE(syscall_fmts);
787 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
791 struct event_format *tp_format;
793 struct format_field *args;
796 struct syscall_fmt *fmt;
797 struct syscall_arg_fmt *arg_fmt;
801 * We need to have this 'calculated' boolean because in some cases we really
802 * don't know what is the duration of a syscall, for instance, when we start
803 * a session and some threads are waiting for a syscall to finish, say 'poll',
804 * in which case all we can do is to print "( ? ) for duration and for the
/*
 * Print "( <duration> ms): " with the number colored by magnitude:
 * red for >= 1ms, yellow for >= 0.01ms, normal otherwise; " ? " when
 * the duration could not be calculated (see the comment above about
 * threads already inside a syscall when the session starts).
 * (Partial view: the 'if (!calculated)' guard line is elided.)
 */
807 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
809 double duration = (double)t / NSEC_PER_MSEC;	/* t is in nanoseconds */
810 size_t printed = fprintf(fp, "(");
813 printed += fprintf(fp, " ? ");
814 else if (duration >= 1.0)
815 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
816 else if (duration >= 0.01)
817 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
819 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
820 return printed + fprintf(fp, "): ");
824 * filename.ptr: The filename char pointer that will be vfs_getname'd
825 * filename.entry_str_pos: Where to insert the string translated from
826 * filename.ptr by the vfs_getname tracepoint/kprobe.
827 * ret_scnprintf: syscall args may set this to a different syscall return
828 * formatter, for instance, fcntl may return fds, file flags, etc.
830 struct thread_trace {
833 unsigned long nr_events;
834 unsigned long pfmaj, pfmin;
837 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
840 short int entry_str_pos;
842 unsigned int namelen;
850 struct intlist *syscall_stats;
853 static struct thread_trace *thread_trace__new(void)
855 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
858 ttrace->paths.max = -1;
860 ttrace->syscall_stats = intlist__new(NULL);
/*
 * Lazily allocate and return the per-thread trace state stashed in
 * thread->priv; on allocation failure warn that samples will be
 * dropped.
 * (Partial view: the failure/return paths are elided.)
 */
865 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
867 struct thread_trace *ttrace;
872 if (thread__priv(thread) == NULL)
873 thread__set_priv(thread, thread_trace__new());
875 if (thread__priv(thread) == NULL)
878 ttrace = thread__priv(thread);
883 color_fprintf(fp, PERF_COLOR_RED,
884 "WARNING: not enough memory, dropping samples!\n");
889 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
890 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
892 struct thread_trace *ttrace = thread__priv(arg->thread);
894 ttrace->ret_scnprintf = ret_scnprintf;
897 #define TRACE_PFMAJ (1 << 0)
898 #define TRACE_PFMIN (1 << 1)
900 static const size_t trace__entry_str_size = 2048;
/*
 * Remember 'pathname' (strdup'ed) for 'fd' in the thread's fd->path
 * table, growing the table with realloc() as needed and zeroing the
 * newly exposed slots.  Returns 0 on success, -1 when strdup fails.
 * (Partial view: the realloc-failure check line is elided.)
 */
902 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
904 struct thread_trace *ttrace = thread__priv(thread);
906 if (fd > ttrace->paths.max) {
907 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
912 if (ttrace->paths.max != -1) {
913 memset(npath + ttrace->paths.max + 1, 0,
914 (fd - ttrace->paths.max) * sizeof(char *));
916 memset(npath, 0, (fd + 1) * sizeof(char *));
919 ttrace->paths.table = npath;
920 ttrace->paths.max = fd;
923 ttrace->paths.table[fd] = strdup(pathname);
925 return ttrace->paths.table[fd] != NULL ? 0 : -1;
/*
 * Resolve fd -> path via readlink() on /proc/<pid>/fd/<fd> (or
 * /proc/<pid>/task/<tid>/fd/<fd> for non-leader threads) and cache it
 * with trace__set_fd_pathname().  Bails out when the link cannot be
 * stat'ed, doesn't fit in the buffer, or grew between lstat and
 * readlink.  readlink() does not NUL-terminate, hence the manual '\0'.
 */
928 static int thread__read_fd_path(struct thread *thread, int fd)
930 char linkname[PATH_MAX], pathname[PATH_MAX];
934 if (thread->pid_ == thread->tid) {
935 scnprintf(linkname, sizeof(linkname),
936 "/proc/%d/fd/%d", thread->pid_, fd);
938 scnprintf(linkname, sizeof(linkname),
939 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
942 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
945 ret = readlink(linkname, pathname, sizeof(pathname));
947 if (ret < 0 || ret > st.st_size)
950 pathname[ret] = '\0';
951 return trace__set_fd_pathname(thread, fd, pathname);
954 static const char *thread__fd_path(struct thread *thread, int fd,
957 struct thread_trace *ttrace = thread__priv(thread);
965 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
968 ++trace->stats.proc_getname;
969 if (thread__read_fd_path(thread, fd))
973 return ttrace->paths.table[fd];
976 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
979 size_t printed = scnprintf(bf, size, "%d", fd);
980 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
983 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
988 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
989 struct syscall_arg *arg)
992 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
993 struct thread_trace *ttrace = thread__priv(arg->thread);
995 if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
996 zfree(&ttrace->paths.table[fd]);
1001 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1004 struct thread_trace *ttrace = thread__priv(thread);
1006 ttrace->filename.ptr = ptr;
1007 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1010 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1011 struct syscall_arg *arg)
1013 unsigned long ptr = arg->val;
1015 if (!arg->trace->vfs_getname)
1016 return scnprintf(bf, size, "%#x", ptr);
1018 thread__set_filename_pos(arg->thread, bf, ptr);
1022 static bool trace__filter_duration(struct trace *trace, double t)
1024 return t < (trace->duration_filter * NSEC_PER_MSEC);
1027 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1029 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1031 return fprintf(fp, "%10.3f ", ts);
1035 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1036 * using ttrace->entry_time for a thread that receives a sys_exit without
1037 * first having received a sys_enter ("poll" issued before tracing session
1038 * starts, lost sys_enter exit due to ring buffer overflow).
1040 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1043 return __trace__fprintf_tstamp(trace, tstamp, fp);
1045 return fprintf(fp, " ? ");
1048 static bool done = false;
1049 static bool interrupted = false;
1051 static void sig_handler(int sig)
1054 interrupted = sig == SIGINT;
1057 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1058 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1060 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1061 printed += fprintf_duration(duration, duration_calculated, fp);
1063 if (trace->multiple_threads) {
1064 if (trace->show_comm)
1065 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1066 printed += fprintf(fp, "%d ", thread->tid);
/*
 * Dispatch a side-band event: report PERF_RECORD_LOST loudly in red,
 * then hand it (and every other event type) to the generic machine
 * event processors.
 * (Partial view: 'int ret', the 'default:' label, 'break's and the
 * final return are elided.)
 */
1072 static int trace__process_event(struct trace *trace, struct machine *machine,
1073 union perf_event *event, struct perf_sample *sample)
1077 switch (event->header.type) {
1078 case PERF_RECORD_LOST:
1079 color_fprintf(trace->output, PERF_COLOR_RED,
1080 "LOST %" PRIu64 " events!\n", event->lost.lost);
1081 ret = machine__process_lost_event(machine, event, sample);
1084 ret = machine__process_event(machine, event, sample);
1091 static int trace__tool_process(struct perf_tool *tool,
1092 union perf_event *event,
1093 struct perf_sample *sample,
1094 struct machine *machine)
1096 struct trace *trace = container_of(tool, struct trace, tool);
1097 return trace__process_event(trace, machine, event, sample);
/*
 * Kernel-address resolver wrapper that warns exactly once (guarded by
 * machine->kptr_restrict_warned) when /proc/sys/kernel/kptr_restrict
 * prevents resolving kernel symbols, then delegates to the regular
 * machine__resolve_kernel_addr().
 * (Partial view: the early 'return NULL' lines are elided.)
 */
1100 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1102 struct machine *machine = vmachine;
1104 if (machine->kptr_restrict_warned)
1107 if (symbol_conf.kptr_restrict) {
1108 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1109 "Check /proc/sys/kernel/kptr_restrict.\n\n"
1110 "Kernel samples will not be resolved.\n");
1111 machine->kptr_restrict_warned = true;
1115 return machine__resolve_kernel_addr(vmachine, addrp, modp);
/*
 * Initialize symbol resolution for the trace: set up symbol handling,
 * create the host machine, register the kptr_restrict-aware resolver
 * and synthesize records for already-running target threads.
 * (Partial view: error-path cleanup and return lines are elided.)
 */
1118 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1120 int err = symbol__init(NULL);
1125 trace->host = machine__new_host();
1126 if (trace->host == NULL)
1129 if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1132 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1133 evlist->threads, trace__tool_process, false,
1134 trace->opts.proc_map_timeout);
1141 static void trace__symbols__exit(struct trace *trace)
1143 machine__exit(trace->host);
/*
 * Allocate sc->arg_fmt, one formatter slot per syscall argument,
 * seeding each slot from the hand-written syscall_fmts entry when one
 * exists.  nr_args == 6 is the "tracepoint format unavailable"
 * fallback (see trace__read_syscall_info()), in which case an explicit
 * fmt->nr_args overrides it.
 * (Partial view: the zero-return and the loop's fmt guard are elided.)
 */
1149 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1153 if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1154 nr_args = sc->fmt->nr_args;
1156 sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1157 if (sc->arg_fmt == NULL)
1160 for (idx = 0; idx < nr_args; ++idx) {
1162 sc->arg_fmt[idx] = sc->fmt->arg[idx];
1165 sc->nr_args = nr_args;
/*
 * Walk the syscall's tracepoint fields and install a default
 * pretty-printer per argument, inferred from the field's C type and
 * name: known filename-ish "const char *" fields, pointers (hex),
 * pid_t, umode_t, and integer fields whose name ends in "fd".
 * Slots already set explicitly via syscall_fmts keep their formatter.
 */
1169 static int syscall__set_arg_fmts(struct syscall *sc)
1171 struct format_field *field;
1174 for (field = sc->args; field; field = field->next, ++idx) {
1175 if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1178 if (strcmp(field->type, "const char *") == 0 &&
1179 (strcmp(field->name, "filename") == 0 ||
1180 strcmp(field->name, "path") == 0 ||
1181 strcmp(field->name, "pathname") == 0))
1182 sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1183 else if (field->flags & FIELD_IS_POINTER)
1184 sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
1185 else if (strcmp(field->type, "pid_t") == 0)
1186 sc->arg_fmt[idx].scnprintf = SCA_PID;
1187 else if (strcmp(field->type, "umode_t") == 0)
1188 sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1189 else if ((strcmp(field->type, "int") == 0 ||
1190 strcmp(field->type, "unsigned int") == 0 ||
1191 strcmp(field->type, "long") == 0) &&
1192 (len = strlen(field->name)) >= 2 &&
1193 strcmp(field->name + len - 2, "fd") == 0) {
/*
 * Survey of which integer types carry fds in the tracepoint formats:
 */
1195 * /sys/kernel/tracing/events/syscalls/sys_enter*
1196 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1201 sc->arg_fmt[idx].scnprintf = SCA_FD;
/*
 * Populate trace->syscalls.table[id]: grow the table if 'id' is beyond
 * the current max (zeroing the new slots), resolve the syscall name,
 * look up its hand-written formatter, fetch the sys_enter_<name>
 * tracepoint format (retrying with the fmt alias, e.g. "newstat" for
 * "stat"), drop the leading syscall-number field and install the
 * per-argument formatters.
 * (Partial view: several error checks/returns are elided.)
 */
1208 static int trace__read_syscall_info(struct trace *trace, int id)
1212 const char *name = syscalltbl__name(trace->sctbl, id);
1217 if (id > trace->syscalls.max) {
1218 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1220 if (nsyscalls == NULL)
1223 if (trace->syscalls.max != -1) {
1224 memset(nsyscalls + trace->syscalls.max + 1, 0,
1225 (id - trace->syscalls.max) * sizeof(*sc));
1227 memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1230 trace->syscalls.table = nsyscalls;
1231 trace->syscalls.max = id;
1234 sc = trace->syscalls.table + id;
1237 sc->fmt = syscall_fmt__find(sc->name);
1239 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1240 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1242 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1243 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1244 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
/* no tp_format -> assume the maximum of 6 arguments */
1247 if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
1250 if (IS_ERR(sc->tp_format))
1253 sc->args = sc->tp_format->format.fields;
1255 * We need to check and discard the first variable '__syscall_nr'
1256 * or 'nr' that mean the syscall number. It is needless here.
1257 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1259 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1260 sc->args = sc->args->next;
1264 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1266 return syscall__set_arg_fmts(sc);
/*
 * Turn the -e/--expr qualifier strlist into an array of syscall ids
 * (trace->ev_qualifier_ids). Each entry may be an exact syscall name or a
 * glob; globs can match several syscalls, in which case the id array is
 * grown beyond its initial one-id-per-entry size. On any invalid name,
 * print the offenders plus a hint and free the array.
 */
1269 static int trace__validate_ev_qualifier(struct trace *trace)
1272 size_t nr_allocated;
1273 struct str_node *pos;
1275 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1276 trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1277 sizeof(trace->ev_qualifier_ids.entries[0]));
1279 if (trace->ev_qualifier_ids.entries == NULL) {
1280 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1286 nr_allocated = trace->ev_qualifier_ids.nr;
1289 strlist__for_each_entry(pos, trace->ev_qualifier) {
1290 const char *sc = pos->s;
/* Exact lookup first; on failure fall back to glob matching. */
1291 int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1294 id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1299 fputs("Error:\tInvalid syscall ", trace->output);
1302 fputs(", ", trace->output);
1305 fputs(sc, trace->output);
1308 trace->ev_qualifier_ids.entries[i++] = id;
1309 if (match_next == -1)
/* Collect every further glob match, growing the array as needed. */
1313 id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1316 if (nr_allocated == trace->ev_qualifier_ids.nr) {
1320 entries = realloc(trace->ev_qualifier_ids.entries,
1321 nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1322 if (entries == NULL) {
1324 fputs("\nError:\t Not enough memory for parsing\n", trace->output);
1327 trace->ev_qualifier_ids.entries = entries;
1329 trace->ev_qualifier_ids.nr++;
1330 trace->ev_qualifier_ids.entries[i++] = id;
/* Error path: suggest where to find valid syscall names. */
1335 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1336 "\nHint:\tand: 'man syscalls'\n", trace->output);
1338 zfree(&trace->ev_qualifier_ids.entries);
1339 trace->ev_qualifier_ids.nr = 0;
1346 * args is to be interpreted as a series of longs but we need to handle
1347 * 8-byte unaligned accesses. args points to raw_data within the event
1348 * and raw_data is guaranteed to be 8-byte unaligned because it is
1349 * preceded by raw_size which is a u32. So we need to copy args to a temp
1350 * variable to read it. Most notably this avoids extended load instructions
1351 * on unaligned addresses
/*
 * Fetch the idx-th raw syscall argument as an unsigned long, using memcpy
 * to cope with the unaligned raw_data layout described above.
 */
1353 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1356 unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1358 memcpy(&val, p, sizeof(val));
/*
 * Print the label for one syscall argument: its tracepoint field name when
 * known, otherwise a positional "argN: " fallback.
 */
1362 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1363 struct syscall_arg *arg)
1365 if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1366 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1368 return scnprintf(bf, size, "arg%d: ", arg->idx);
/*
 * Print one syscall argument value: delegate to the argument's registered
 * scnprintf beautifier (passing along its 'parm' cookie) when one exists,
 * otherwise print the raw value as a signed long.
 */
1371 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1372 struct syscall_arg *arg, unsigned long val)
1374 if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1376 if (sc->arg_fmt[arg->idx].parm)
1377 arg->parm = sc->arg_fmt[arg->idx].parm;
1378 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1380 return scnprintf(bf, size, "%ld", val);
/*
 * Format the full argument list of a syscall entry into 'bf'. When the
 * tracepoint format was parsed we iterate its fields, suppressing
 * zero-valued args that have no string mapping; when it wasn't (see the
 * IS_ERR branch) we dump all raw args positionally.
 */
1383 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1384 unsigned char *args, struct trace *trace,
1385 struct thread *thread)
1390 struct syscall_arg arg = {
1397 struct thread_trace *ttrace = thread__priv(thread);
1400 * Things like fcntl will set this in its 'cmd' formatter to pick the
1401 * right formatter for the return value (an fd? file flags?), which is
1402 * not needed for syscalls that always return a given type, say an fd.
1404 ttrace->ret_scnprintf = NULL;
1406 if (sc->args != NULL) {
1407 struct format_field *field;
1409 for (field = sc->args; field;
1410 field = field->next, ++arg.idx, bit <<= 1) {
1414 val = syscall_arg__val(&arg, arg.idx);
1417 * Suppress this argument if its value is zero and
1418 * and we don't have a string associated in an
/* ...unless the formatter opted in to showing zeros (strarrays etc). */
1423 (sc->arg_fmt[arg.idx].show_zero ||
1424 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1425 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1426 sc->arg_fmt[arg.idx].parm))
1429 printed += scnprintf(bf + printed, size - printed,
1430 "%s%s: ", printed ? ", " : "", field->name);
1431 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1433 } else if (IS_ERR(sc->tp_format)) {
1435 * If we managed to read the tracepoint /format file, then we
1436 * may end up not having any args, like with gettid(), so only
1437 * print the raw args when we didn't manage to read it.
1439 while (arg.idx < sc->nr_args) {
1442 val = syscall_arg__val(&arg, arg.idx);
1444 printed += scnprintf(bf + printed, size - printed, ", ");
1445 printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1446 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
/* Common signature for the per-tracepoint sample handlers below. */
1456 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1457 union perf_event *event,
1458 struct perf_sample *sample);
/*
 * Map a syscall id from a sample to its struct syscall descriptor, reading
 * the syscall info on first use. Returns NULL (after diagnosing to
 * trace->output) for invalid ids or when the info couldn't be read.
 */
1460 static struct syscall *trace__syscall_info(struct trace *trace,
1461 struct perf_evsel *evsel, int id)
1467 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1468 * before that, leaving at a higher verbosity level till that is
1469 * explained. Reproduced with plain ftrace with:
1471 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1472 * grep "NR -1 " /t/trace_pipe
1474 * After generating some load on the machine.
1478 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1479 id, perf_evsel__name(evsel), ++n);
/* Lazily read the syscall's tracepoint info the first time it is seen. */
1484 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1485 trace__read_syscall_info(trace, id))
1488 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1491 return &trace->syscalls.table[id];
1495 fprintf(trace->output, "Problems reading syscall %d", id);
1496 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1497 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1498 fputs(" information\n", trace->output);
/*
 * Account one completed syscall into the per-thread stats used by the
 * summary: find/create the per-syscall-id stats node and fold in the
 * entry-to-exit duration.
 */
1503 static void thread__update_stats(struct thread_trace *ttrace,
1504 int id, struct perf_sample *sample)
1506 struct int_node *inode;
1507 struct stats *stats;
1510 inode = intlist__findnew(ttrace->syscall_stats, id);
1514 stats = inode->priv;
1515 if (stats == NULL) {
1516 stats = malloc(sizeof(struct stats));
1520 inode->priv = stats;
/* Only compute a duration when we saw the matching sys_enter. */
1523 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1524 duration = sample->time - ttrace->entry_time;
1526 update_stats(stats, duration);
/*
 * When a new event arrives while the current thread still has an unfinished
 * syscall entry pending, flush that entry with a trailing ") ..." marker
 * (strace-style "unfinished" line) and clear the pending flag.
 */
1529 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1531 struct thread_trace *ttrace;
1535 if (trace->current == NULL)
1538 ttrace = thread__priv(trace->current);
1540 if (!ttrace->entry_pending)
1543 duration = sample->time - ttrace->entry_time;
1545 printed = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1546 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1547 ttrace->entry_pending = false;
/*
 * raw_syscalls:sys_enter handler: format the syscall name and arguments into
 * the per-thread entry_str. The line is printed immediately only when no
 * duration filter/summary/min-stack option defers it to sys_exit; otherwise
 * it is kept pending so sys_exit can complete it with the return value.
 */
1552 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1553 union perf_event *event __maybe_unused,
1554 struct perf_sample *sample)
1559 struct thread *thread;
1560 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1561 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1562 struct thread_trace *ttrace;
1567 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1568 ttrace = thread__trace(thread, trace->output);
1572 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
/* Allocate the reusable per-thread formatting buffer on first use. */
1574 if (ttrace->entry_str == NULL) {
1575 ttrace->entry_str = malloc(trace__entry_str_size);
1576 if (!ttrace->entry_str)
1580 if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1581 trace__printf_interrupted_entry(trace, sample);
1583 ttrace->entry_time = sample->time;
1584 msg = ttrace->entry_str;
1585 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1587 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1588 args, trace, thread);
1591 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1592 trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1593 fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1596 ttrace->entry_pending = true;
1597 /* See trace__vfs_getname & trace__sys_exit */
1598 ttrace->filename.pending_open = false;
/* Track the most recently seen thread for interrupted-entry handling. */
1601 if (trace->current != thread) {
1602 thread__put(trace->current);
1603 trace->current = thread__get(thread);
1607 thread__put(thread);
/*
 * Resolve a sample's address and walk its callchain into 'cursor',
 * bounded by trace->max_stack entries.
 */
1611 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1612 struct perf_sample *sample,
1613 struct callchain_cursor *cursor)
1615 struct addr_location al;
1617 if (machine__resolve(trace->host, &al, sample) < 0 ||
1618 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
/*
 * Print the callchain previously resolved into the global callchain_cursor,
 * with a fixed 38-column left alignment and symbol names (unknown frames
 * printed as raw addresses).
 */
1624 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1626 /* TODO: user-configurable print_opts */
1627 const unsigned int print_opts = EVSEL__PRINT_SYM |
1629 EVSEL__PRINT_UNKNOWN_AS_ADDR;
1631 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
/*
 * raw_syscalls:sys_exit handler: update per-thread stats, apply the
 * duration filter, then complete the line begun at sys_enter (or print a
 * "continued" marker if the entry line was already flushed), formatting the
 * return value with errno names, per-syscall ret formatters, hex, timeout
 * or child-pid decoration as appropriate, plus an optional callchain.
 */
1634 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1635 union perf_event *event __maybe_unused,
1636 struct perf_sample *sample)
1640 bool duration_calculated = false;
1641 struct thread *thread;
1642 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1643 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1644 struct thread_trace *ttrace;
1649 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1650 ttrace = thread__trace(thread, trace->output);
1655 thread__update_stats(ttrace, id, sample);
1657 ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
/* A successful open with a pending vfs_getname maps fd -> pathname. */
1659 if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1660 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1661 ttrace->filename.pending_open = false;
1662 ++trace->stats.vfs_getname;
1665 if (ttrace->entry_time) {
1666 duration = sample->time - ttrace->entry_time;
1667 if (trace__filter_duration(trace, duration))
1669 duration_calculated = true;
1670 } else if (trace->duration_filter)
1673 if (sample->callchain) {
1674 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1675 if (callchain_ret == 0) {
1676 if (callchain_cursor.nr < trace->min_stack)
1682 if (trace->summary_only)
1685 trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1687 if (ttrace->entry_pending) {
1688 fprintf(trace->output, "%-70s", ttrace->entry_str);
/* The entry line was already printed (e.g. interrupted): mark resumption. */
1690 fprintf(trace->output, " ... [");
1691 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1692 fprintf(trace->output, "]: %s()", sc->name);
1695 if (sc->fmt == NULL) {
1699 fprintf(trace->output, ") = %ld", ret);
1700 } else if (ret < 0) {
/* Negative returns are errno values: print name and description. */
1702 char bf[STRERR_BUFSIZE];
1703 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1704 *e = audit_errno_to_name(-ret);
1706 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1708 } else if (ret == 0 && sc->fmt->timeout)
1709 fprintf(trace->output, ") = 0 Timeout");
1710 else if (ttrace->ret_scnprintf) {
/* One-shot return formatter set by an arg beautifier (e.g. fcntl cmd). */
1712 struct syscall_arg arg = {
1717 ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
1718 ttrace->ret_scnprintf = NULL;
1719 fprintf(trace->output, ") = %s", bf);
1720 } else if (sc->fmt->hexret)
1721 fprintf(trace->output, ") = %#lx", ret);
1722 else if (sc->fmt->errpid) {
/* Returned pid: decorate with the child's comm when we know it. */
1723 struct thread *child = machine__find_thread(trace->host, ret, ret);
1725 if (child != NULL) {
1726 fprintf(trace->output, ") = %ld", ret);
1727 if (child->comm_set)
1728 fprintf(trace->output, " (%s)", thread__comm_str(child));
1734 fputc('\n', trace->output);
1736 if (callchain_ret > 0)
1737 trace__fprintf_callchain(trace, sample);
1738 else if (callchain_ret < 0)
1739 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1741 ttrace->entry_pending = false;
1744 thread__put(thread);
/*
 * probe:vfs_getname handler: capture the pathname being resolved by the
 * kernel for the thread's in-flight open, stash it for trace__sys_exit to
 * map to the returned fd, and splice it into the pending entry_str at the
 * position recorded by the SCA_FILENAME beautifier (truncating from the
 * front if it doesn't fit).
 */
1748 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1749 union perf_event *event __maybe_unused,
1750 struct perf_sample *sample)
1752 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1753 struct thread_trace *ttrace;
1754 size_t filename_len, entry_str_len, to_move;
1755 ssize_t remaining_space;
1757 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1762 ttrace = thread__priv(thread);
1766 filename_len = strlen(filename);
1767 if (filename_len == 0)
/* Grow the per-thread filename buffer to fit the new path. */
1770 if (ttrace->filename.namelen < filename_len) {
1771 char *f = realloc(ttrace->filename.name, filename_len + 1);
1776 ttrace->filename.namelen = filename_len;
1777 ttrace->filename.name = f;
1780 strcpy(ttrace->filename.name, filename);
1781 ttrace->filename.pending_open = true;
1783 if (!ttrace->filename.ptr)
1786 entry_str_len = strlen(ttrace->entry_str);
1787 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1788 if (remaining_space <= 0)
/* Keep the tail of the path if the whole thing doesn't fit. */
1791 if (filename_len > (size_t)remaining_space) {
1792 filename += filename_len - remaining_space;
1793 filename_len = remaining_space;
/* Open a gap in entry_str at the recorded position and copy the path in. */
1796 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1797 pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1798 memmove(pos + filename_len, pos, to_move);
1799 memcpy(pos, filename, filename_len);
1801 ttrace->filename.ptr = 0;
1802 ttrace->filename.entry_str_pos = 0;
1804 thread__put(thread);
/*
 * sched:sched_stat_runtime handler (--sched): accumulate per-thread and
 * global on-CPU runtime in milliseconds for the summary. The trailing
 * fprintf is an error/diagnostic path dumping the raw event fields.
 */
1809 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1810 union perf_event *event __maybe_unused,
1811 struct perf_sample *sample)
1813 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1814 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1815 struct thread *thread = machine__findnew_thread(trace->host,
1818 struct thread_trace *ttrace = thread__trace(thread, trace->output);
1823 ttrace->runtime_ms += runtime_ms;
1824 trace->runtime_ms += runtime_ms;
1826 thread__put(thread);
1830 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1832 perf_evsel__strval(evsel, sample, "comm"),
1833 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1835 perf_evsel__intval(evsel, sample, "vruntime"));
/*
 * print_binary() callback for BPF output events: emit printable characters
 * as-is and everything else as '.'; all other binary-print phases are
 * intentionally no-ops.
 */
1839 static void bpf_output__printer(enum binary_printer_ops op,
1840 unsigned int val, void *extra)
1842 FILE *output = extra;
1843 unsigned char ch = (unsigned char)val;
1846 case BINARY_PRINT_CHAR_DATA:
1847 fprintf(output, "%c", isprint(ch) ? ch : '.');
1849 case BINARY_PRINT_DATA_BEGIN:
1850 case BINARY_PRINT_LINE_BEGIN:
1851 case BINARY_PRINT_ADDR:
1852 case BINARY_PRINT_NUM_DATA:
1853 case BINARY_PRINT_NUM_PAD:
1854 case BINARY_PRINT_SEP:
1855 case BINARY_PRINT_CHAR_PAD:
1856 case BINARY_PRINT_LINE_END:
1857 case BINARY_PRINT_DATA_END:
/* Dump a BPF output event's raw payload, 8 bytes per line, via the printer above. */
1863 static void bpf_output__fprintf(struct trace *trace,
1864 struct perf_sample *sample)
1866 print_binary(sample->raw_data, sample->raw_size, 8,
1867 bpf_output__printer, trace->output);
/*
 * Generic handler for non-syscall events added via --event: flush any
 * pending syscall entry, print a timestamped line with the event name and
 * either its BPF output payload or its tracepoint-formatted fields, plus an
 * optional callchain.
 */
1870 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1871 union perf_event *event __maybe_unused,
1872 struct perf_sample *sample)
1874 int callchain_ret = 0;
1876 if (sample->callchain) {
1877 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1878 if (callchain_ret == 0) {
1879 if (callchain_cursor.nr < trace->min_stack)
1885 trace__printf_interrupted_entry(trace, sample);
1886 trace__fprintf_tstamp(trace, sample->time, trace->output);
/* Placeholder keeps columns aligned with syscall duration output. */
1888 if (trace->trace_syscalls)
1889 fprintf(trace->output, "( ): ");
1891 fprintf(trace->output, "%s:", evsel->name);
1893 if (perf_evsel__is_bpf_output(evsel)) {
1894 bpf_output__fprintf(trace, sample);
1895 } else if (evsel->tp_format) {
1896 event_format__fprintf(evsel->tp_format, sample->cpu,
1897 sample->raw_data, sample->raw_size,
1901 fprintf(trace->output, ")\n");
1903 if (callchain_ret > 0)
1904 trace__fprintf_callchain(trace, sample);
1905 else if (callchain_ret < 0)
1906 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
/*
 * Print a resolved sample location for page faults: "dso@" and/or
 * "symbol+0xoff" when resolvable (or in verbose mode), falling back to the
 * raw map address and finally to the raw sample address.
 */
1911 static void print_location(FILE *f, struct perf_sample *sample,
1912 struct addr_location *al,
1913 bool print_dso, bool print_sym)
1916 if ((verbose > 0 || print_dso) && al->map)
1917 fprintf(f, "%s@", al->map->dso->long_name);
1919 if ((verbose > 0 || print_sym) && al->sym)
1920 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1921 al->addr - al->sym->start);
1923 fprintf(f, "0x%" PRIx64, al->addr);
1925 fprintf(f, "0x%" PRIx64, sample->addr);
/*
 * Handler for the page-fault software events (--pf): resolve the faulting
 * IP and the faulted data address, then print
 * "majfault/minfault [where] => target (type level)". map_type starts as
 * 'd' (data map) — presumably flipped elsewhere when the address resolves
 * to an executable map; that line is elided in this excerpt.
 */
1928 static int trace__pgfault(struct trace *trace,
1929 struct perf_evsel *evsel,
1930 union perf_event *event __maybe_unused,
1931 struct perf_sample *sample)
1933 struct thread *thread;
1934 struct addr_location al;
1935 char map_type = 'd';
1936 struct thread_trace *ttrace;
1938 int callchain_ret = 0;
1940 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1942 if (sample->callchain) {
1943 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1944 if (callchain_ret == 0) {
1945 if (callchain_cursor.nr < trace->min_stack)
1951 ttrace = thread__trace(thread, trace->output);
/* PERF_COUNT_SW_PAGE_FAULTS_MAJ vs _MIN selects the per-thread counter. */
1955 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1960 if (trace->summary_only)
1963 thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1966 trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
1968 fprintf(trace->output, "%sfault [",
1969 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1972 print_location(trace->output, sample, &al, false, true);
1974 fprintf(trace->output, "] => ");
/* Try resolving the target address as data first, then as code. */
1976 thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1980 thread__find_addr_location(thread, sample->cpumode,
1981 MAP__FUNCTION, sample->addr, &al);
1989 print_location(trace->output, sample, &al, true, false);
1991 fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1993 if (callchain_ret > 0)
1994 trace__fprintf_callchain(trace, sample);
1995 else if (callchain_ret < 0)
1996 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2000 thread__put(thread);
/*
 * Record the first usable sample timestamp as the base for relative
 * timestamps (unless --time/full_time disabled that).
 */
2004 static void trace__set_base_time(struct trace *trace,
2005 struct perf_evsel *evsel,
2006 struct perf_sample *sample)
2009 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2010 * and don't use sample->time unconditionally, we may end up having
2011 * some other event in the future without PERF_SAMPLE_TIME for good
2012 * reason, i.e. we may not be interested in its timestamps, just in
2013 * it taking place, picking some piece of information when it
2014 * appears in our event stream (vfs_getname comes to mind).
2016 if (trace->base_time == 0 && !trace->full_time &&
2017 (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2018 trace->base_time = sample->time;
/*
 * perf_tool sample callback used in replay mode (perf trace -i): skip
 * filtered threads, establish the base time, and dispatch to the handler
 * installed on the evsel.
 */
2021 static int trace__process_sample(struct perf_tool *tool,
2022 union perf_event *event,
2023 struct perf_sample *sample,
2024 struct perf_evsel *evsel,
2025 struct machine *machine __maybe_unused)
2027 struct trace *trace = container_of(tool, struct trace, tool);
2028 struct thread *thread;
2031 tracepoint_handler handler = evsel->handler;
2033 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2034 if (thread && thread__is_filtered(thread))
2037 trace__set_base_time(trace, evsel, sample);
2041 handler(trace, evsel, event, sample);
2044 thread__put(thread);
/*
 * 'perf trace record': build an argv mirroring the trace options — syscall
 * tracepoints (raw_syscalls on current kernels, syscalls on older ones like
 * RHEL6), major/minor page-fault events — append the user's args, and hand
 * off to cmd_record().
 */
2048 static int trace__record(struct trace *trace, int argc, const char **argv)
2050 unsigned int rec_argc, i, j;
2051 const char **rec_argv;
2052 const char * const record_args[] = {
2059 const char * const sc_args[] = { "-e", };
2060 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2061 const char * const majpf_args[] = { "-e", "major-faults" };
2062 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2063 const char * const minpf_args[] = { "-e", "minor-faults" };
2064 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2066 /* +1 is for the event string below */
2067 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2068 majpf_args_nr + minpf_args_nr + argc;
2069 rec_argv = calloc(rec_argc + 1, sizeof(char *));
2071 if (rec_argv == NULL)
2075 for (i = 0; i < ARRAY_SIZE(record_args); i++)
2076 rec_argv[j++] = record_args[i];
2078 if (trace->trace_syscalls) {
2079 for (i = 0; i < sc_args_nr; i++)
2080 rec_argv[j++] = sc_args[i];
2082 /* event string may be different for older kernels - e.g., RHEL6 */
2083 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2084 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2085 else if (is_valid_tracepoint("syscalls:sys_enter"))
2086 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2088 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2094 if (trace->trace_pgfaults & TRACE_PFMAJ)
2095 for (i = 0; i < majpf_args_nr; i++)
2096 rec_argv[j++] = majpf_args[i];
2098 if (trace->trace_pgfaults & TRACE_PFMIN)
2099 for (i = 0; i < minpf_args_nr; i++)
2100 rec_argv[j++] = minpf_args[i];
2102 for (i = 0; i < (unsigned int)argc; i++)
2103 rec_argv[j++] = argv[i];
2105 return cmd_record(j, rec_argv);
2108 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
/*
 * Try to add the probe:vfs_getname dynamic tracepoint (used to map open()
 * fds to pathnames). Returns false if the probe or its "pathname" field is
 * unavailable; the evsel is deleted in that case.
 */
2110 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2112 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2117 if (perf_evsel__field(evsel, "pathname") == NULL) {
2118 perf_evsel__delete(evsel);
2122 evsel->handler = trace__vfs_getname;
2123 perf_evlist__add(evlist, evsel);
/*
 * Create a software page-fault evsel (config selects major vs minor),
 * sampling every fault (period 1), with trace__pgfault as its handler.
 */
2127 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2129 struct perf_evsel *evsel;
2130 struct perf_event_attr attr = {
2131 .type = PERF_TYPE_SOFTWARE,
2135 attr.config = config;
2136 attr.sample_period = 1;
2138 event_attr_init(&attr);
2140 evsel = perf_evsel__new(&attr);
2142 evsel->handler = trace__pgfault;
/*
 * Live-mode dispatcher: non-sample records go to the generic event
 * processor; samples are mapped back to their evsel by sample id, validated
 * (tracepoint samples must carry a payload) and forwarded to the evsel's
 * installed tracepoint handler.
 */
2147 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2149 const u32 type = event->header.type;
2150 struct perf_evsel *evsel;
2152 if (type != PERF_RECORD_SAMPLE) {
2153 trace__process_event(trace, trace->host, event, sample);
2157 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2158 if (evsel == NULL) {
2159 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2163 trace__set_base_time(trace, evsel, sample);
2165 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2166 sample->raw_data == NULL) {
2167 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2168 perf_evsel__name(evsel), sample->tid,
2169 sample->cpu, sample->raw_size);
2171 tracepoint_handler handler = evsel->handler;
2172 handler(trace, evsel, event, sample);
/*
 * Create and add the raw_syscalls:sys_enter/sys_exit tracepoint evsels,
 * wiring their 'args'/'ret' field accessors and handlers. Uses goto-chain
 * cleanup so sys_enter is freed if sys_exit setup fails.
 */
2176 static int trace__add_syscall_newtp(struct trace *trace)
2179 struct perf_evlist *evlist = trace->evlist;
2180 struct perf_evsel *sys_enter, *sys_exit;
2182 sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2183 if (sys_enter == NULL)
2186 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2187 goto out_delete_sys_enter;
2189 sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2190 if (sys_exit == NULL)
2191 goto out_delete_sys_enter;
2193 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2194 goto out_delete_sys_exit;
2196 perf_evlist__add(evlist, sys_enter);
2197 perf_evlist__add(evlist, sys_exit);
2199 if (callchain_param.enabled && !trace->kernel_syscallchains) {
2201 * We're interested only in the user space callchain
2202 * leading to the syscall, allow overriding that for
2203 * debugging reasons using --kernel_syscall_callchains
2205 sys_exit->attr.exclude_callchain_kernel = 1;
2208 trace->syscalls.events.sys_enter = sys_enter;
2209 trace->syscalls.events.sys_exit = sys_exit;
2215 out_delete_sys_exit:
2216 perf_evsel__delete_priv(sys_exit);
2217 out_delete_sys_enter:
2218 perf_evsel__delete_priv(sys_enter);
/*
 * Build an "id in/not-in {...}" tracepoint filter expression from the
 * validated qualifier ids and append it to both the sys_enter and sys_exit
 * evsels, so the kernel filters unwanted syscalls at the source.
 */
2222 static int trace__set_ev_qualifier_filter(struct trace *trace)
2225 struct perf_evsel *sys_exit;
2226 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2227 trace->ev_qualifier_ids.nr,
2228 trace->ev_qualifier_ids.entries);
2233 if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2235 sys_exit = trace->syscalls.events.sys_exit;
2236 err = perf_evsel__append_tp_filter(sys_exit, filter);
/*
 * Filter out pids that would create a feedback loop when tracing
 * system-wide: pids[0] (presumably our own pid — its initialization is
 * elided in this excerpt) plus, if walking up the parent chain reaches an
 * sshd, that sshd too, so the terminal traffic generated by printing the
 * trace isn't itself traced.
 */
2247 static int trace__set_filter_loop_pids(struct trace *trace)
2249 unsigned int nr = 1;
2253 struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2255 while (thread && nr < ARRAY_SIZE(pids)) {
2256 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2261 if (!strcmp(thread__comm_str(parent), "sshd")) {
2262 pids[nr++] = parent->tid;
2268 return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
/*
 * Live tracing main loop: set up the evlist (syscall tracepoints,
 * vfs_getname, page faults, sched_stat_runtime), configure callchains and
 * pid/syscall filters, open/mmap the events, optionally fork and start the
 * workload, then consume ring-buffer events until done, finishing with the
 * thread summary. The labels at the bottom are the shared goto-chain error
 * paths. NOTE(review): many lines are elided in this excerpt.
 */
2271 static int trace__run(struct trace *trace, int argc, const char **argv)
2273 struct perf_evlist *evlist = trace->evlist;
2274 struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2276 unsigned long before;
2277 const bool forks = argc > 0;
2278 bool draining = false;
2282 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2283 goto out_error_raw_syscalls;
2285 if (trace->trace_syscalls)
2286 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2288 if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2289 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2290 if (pgfault_maj == NULL)
2292 perf_evlist__add(evlist, pgfault_maj);
2295 if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2296 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2297 if (pgfault_min == NULL)
2299 perf_evlist__add(evlist, pgfault_min);
2303 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2304 trace__sched_stat_runtime))
2305 goto out_error_sched_stat_runtime;
2307 err = perf_evlist__create_maps(evlist, &trace->opts.target);
2309 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2310 goto out_delete_evlist;
2313 err = trace__symbols_init(trace, evlist);
2315 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2316 goto out_delete_evlist;
2319 perf_evlist__config(evlist, &trace->opts, NULL);
/* Enable callchains only on the evsels that want them. */
2321 if (callchain_param.enabled) {
2322 bool use_identifier = false;
2324 if (trace->syscalls.events.sys_exit) {
2325 perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2326 &trace->opts, &callchain_param);
2327 use_identifier = true;
2331 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2332 use_identifier = true;
2336 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2337 use_identifier = true;
2340 if (use_identifier) {
2342 * Now we have evsels with different sample_ids, use
2343 * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2344 * from a fixed position in each ring buffer record.
2346 * As of this the changeset introducing this comment, this
2347 * isn't strictly needed, as the fields that can come before
2348 * PERF_SAMPLE_ID are all used, but we'll probably disable
2349 * some of those for things like copying the payload of
2350 * pointer syscall arguments, and for vfs_getname we don't
2351 * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2352 * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2354 perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2355 perf_evlist__reset_sample_bit(evlist, ID);
2359 signal(SIGCHLD, sig_handler);
2360 signal(SIGINT, sig_handler);
2363 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2366 fprintf(trace->output, "Couldn't run the workload!\n");
2367 goto out_delete_evlist;
2371 err = perf_evlist__open(evlist);
2373 goto out_error_open;
2375 err = bpf__apply_obj_config();
2377 char errbuf[BUFSIZ];
2379 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2380 pr_err("ERROR: Apply config to BPF failed: %s\n",
2382 goto out_error_open;
2386 * Better not use !target__has_task() here because we need to cover the
2387 * case where no threads were specified in the command line, but a
2388 * workload was, and in that case we will fill in the thread_map when
2389 * we fork the workload in perf_evlist__prepare_workload.
2391 if (trace->filter_pids.nr > 0)
2392 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2393 else if (thread_map__pid(evlist->threads, 0) == -1)
2394 err = trace__set_filter_loop_pids(trace);
2399 if (trace->ev_qualifier_ids.nr > 0) {
2400 err = trace__set_ev_qualifier_filter(trace);
2404 pr_debug("event qualifier tracepoint filter: %s\n",
2405 trace->syscalls.events.sys_exit->filter);
2408 err = perf_evlist__apply_filters(evlist, &evsel);
2410 goto out_error_apply_filters;
2412 err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2414 goto out_error_mmap;
2416 if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2417 perf_evlist__enable(evlist);
2420 perf_evlist__start_workload(evlist);
2422 if (trace->opts.initial_delay) {
2423 usleep(trace->opts.initial_delay * 1000);
2424 perf_evlist__enable(evlist);
/* Show tids in the output when more than one thread can appear. */
2427 trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2428 evlist->threads->nr > 1 ||
2429 perf_evlist__first(evlist)->attr.inherit;
/* Main consume loop: drain each mmap, then poll when nothing arrived. */
2431 before = trace->nr_events;
2433 for (i = 0; i < evlist->nr_mmaps; i++) {
2434 union perf_event *event;
2436 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2437 struct perf_sample sample;
2441 err = perf_evlist__parse_sample(evlist, event, &sample);
2443 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2447 trace__handle_event(trace, event, &sample);
2449 perf_evlist__mmap_consume(evlist, i);
/* On interrupt, disable events and drain what's left in the buffers. */
2454 if (done && !draining) {
2455 perf_evlist__disable(evlist);
2461 if (trace->nr_events == before) {
2462 int timeout = done ? 100 : -1;
2464 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2465 if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2475 thread__zput(trace->current);
2477 perf_evlist__disable(evlist);
2481 trace__fprintf_thread_summary(trace, trace->output);
2483 if (trace->show_tool_stats) {
2484 fprintf(trace->output, "Stats:\n "
2485 " vfs_getname : %" PRIu64 "\n"
2486 " proc_getname: %" PRIu64 "\n",
2487 trace->stats.vfs_getname,
2488 trace->stats.proc_getname);
2493 trace__symbols__exit(trace);
2495 perf_evlist__delete(evlist);
2496 trace->evlist = NULL;
2497 trace->live = false;
/* Shared error-reporting tails; all funnel into out_delete_evlist. */
2500 char errbuf[BUFSIZ];
2502 out_error_sched_stat_runtime:
2503 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2506 out_error_raw_syscalls:
2507 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2511 perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2515 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2518 fprintf(trace->output, "%s\n", errbuf);
2519 goto out_delete_evlist;
2521 out_error_apply_filters:
2522 fprintf(trace->output,
2523 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2524 evsel->filter, perf_evsel__name(evsel), errno,
2525 str_error_r(errno, errbuf, sizeof(errbuf)));
2526 goto out_delete_evlist;
2529 fprintf(trace->output, "Not enough memory to run!\n");
2530 goto out_delete_evlist;
2533 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2534 goto out_delete_evlist;
/*
 * Replay mode ('perf trace -i perf.data'): instead of live tracing, read a
 * previously recorded session and push its samples through the same
 * formatting path used for live mode.  Returns 0 on success, negative on
 * error.
 */
2537 static int trace__replay(struct trace *trace)
/* Tracepoints that get a dedicated handler (filename beautification). */
2539 const struct perf_evsel_str_handler handlers[] = {
2540 { "probe:vfs_getname", trace__vfs_getname, },
2542 struct perf_data_file file = {
2544 .mode = PERF_DATA_MODE_READ,
2545 .force = trace->force,
2547 struct perf_session *session;
2548 struct perf_evsel *evsel;
/*
 * Wire up the perf_tool callbacks: samples go through our printer, the
 * remaining event types use the stock perf_event__process_* handlers so
 * thread/mmap/comm state is reconstructed from the file.
 */
2551 trace->tool.sample = trace__process_sample;
2552 trace->tool.mmap = perf_event__process_mmap;
2553 trace->tool.mmap2 = perf_event__process_mmap2;
2554 trace->tool.comm = perf_event__process_comm;
2555 trace->tool.exit = perf_event__process_exit;
2556 trace->tool.fork = perf_event__process_fork;
2557 trace->tool.attr = perf_event__process_attr;
2558 trace->tool.tracing_data = perf_event__process_tracing_data;
2559 trace->tool.build_id = perf_event__process_build_id;
2560 trace->tool.namespaces = perf_event__process_namespaces;
/* Replay needs time-ordered delivery: sys_enter must precede sys_exit. */
2562 trace->tool.ordered_events = true;
2563 trace->tool.ordering_requires_timestamps = true;
2565 /* add tid to output */
2566 trace->multiple_threads = true;
2568 session = perf_session__new(&file, false, &trace->tool);
2569 if (session == NULL)
/* Restrict symbol resolution to the recorded pids/tids, when given. */
2572 if (trace->opts.target.pid)
2573 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2575 if (trace->opts.target.tid)
2576 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2578 if (symbol__init(&session->header.env) < 0)
2581 trace->host = &session->machines.host;
2583 err = perf_session__set_tracepoints_handlers(session, handlers);
/* Hook the syscall entry tracepoint and cache its 'args' field offset. */
2587 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2588 "raw_syscalls:sys_enter");
2589 /* older kernels have syscalls tp versus raw_syscalls */
2591 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2592 "syscalls:sys_enter");
2595 (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2596 perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2597 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
/* Same dance for the exit tracepoint and its 'ret' field. */
2601 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2602 "raw_syscalls:sys_exit");
2604 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2605 "syscalls:sys_exit");
2607 (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2608 perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2609 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
/* Recorded page-fault software events are printed via trace__pgfault. */
2613 evlist__for_each_entry(session->evlist, evsel) {
2614 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2615 (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2616 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2617 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2618 evsel->handler = trace__pgfault;
2623 err = perf_session__process_events(session);
2625 pr_err("Failed to process events, error %d", err);
2627 else if (trace->summary)
2628 trace__fprintf_thread_summary(trace, trace->output);
2631 perf_session__delete(session);
/* Print the banner that precedes the per-thread summary table. */
2636 static size_t trace__fprintf_threads_header(FILE *fp)
2640 printed = fprintf(fp, "\n Summary of events:\n\n");
/*
 * rb_resort helper: re-sorts a thread's per-syscall stats intlist by total
 * time spent (entry->msecs, computed as call count * average duration,
 * converted from ns).  The 'a->msecs > b->msecs' comparison yields a
 * descending walk, so the costliest syscalls are printed first.
 */
2645 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2646 struct stats *stats;
/* Each source node carries the syscall id in ->i and its stats in ->priv. */
2651 struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2652 struct stats *stats = source->priv;
2654 entry->syscall = source->i;
2655 entry->stats = stats;
/* stats may be NULL for a syscall seen but never timed; treat as 0 msec. */
2656 entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
/*
 * Print one thread's per-syscall statistics table (calls, total, min, avg,
 * max, stddev), sorted by total time via the syscall_stats resort helper.
 * Returns the number of characters printed.
 */
2659 static size_t thread__dump_stats(struct thread_trace *ttrace,
2660 struct trace *trace, FILE *fp)
2665 DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2667 if (syscall_stats == NULL)
2670 printed += fprintf(fp, "\n");
/* Table header; columns match the fprintf formats in the loop below. */
2672 printed += fprintf(fp, " syscall calls total min avg max stddev\n");
2673 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
2674 printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");
2676 resort_rb__for_each_entry(nd, syscall_stats) {
2677 struct stats *stats = syscall_stats_entry->stats;
/* Durations are accumulated in ns; convert to msec for display. */
2679 double min = (double)(stats->min) / NSEC_PER_MSEC;
2680 double max = (double)(stats->max) / NSEC_PER_MSEC;
2681 double avg = avg_stats(stats);
2683 u64 n = (u64) stats->n;
/* Relative stddev (%); guard against division by a zero average. */
2685 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2686 avg /= NSEC_PER_MSEC;
/* Map the syscall id back to its name via the global syscall table. */
2688 sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2689 printed += fprintf(fp, " %-15s", sc->name);
2690 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2691 n, syscall_stats_entry->msecs, min, avg);
2692 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2696 resort_rb__delete(syscall_stats);
2697 printed += fprintf(fp, "\n\n");
/*
 * Print the summary line for one thread (comm, tid, event count, share of
 * all events, fault counts, runtime) followed by its per-syscall stats
 * table.  Returns the number of characters printed.
 */
2702 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2705 struct thread_trace *ttrace = thread__priv(thread);
/* This thread's share of all events seen in the session, in percent. */
2711 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2713 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2714 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2715 printed += fprintf(fp, "%.1f%%", ratio);
/* Major/minor fault counters are only printed when non-zero (presumably
 * guarded by the elided conditionals) — the surrounding ifs are not
 * visible in this excerpt. */
2717 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2719 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2721 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2722 else if (fputc('\n', fp) != EOF)
2725 printed += thread__dump_stats(ttrace, trace, fp);
/* Event count for a thread, tolerating threads that never got a ->priv. */
2730 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2732 return ttrace ? ttrace->nr_events : 0;
/*
 * rb_resort helper: re-sorts the machine's thread tree by per-thread event
 * count ('<' comparison, so the busiest threads come out last in the walk;
 * the exact walk order depends on the rb_resort macro internals).
 */
2735 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2736 struct thread *thread;
2739 entry->thread = rb_entry(nd, struct thread, rb_node);
/*
 * Print the end-of-run summary: header plus one trace__fprintf_thread()
 * section per thread, ordered by event count via the threads resort helper.
 * Returns the number of characters printed.
 */
2742 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2744 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2745 size_t printed = trace__fprintf_threads_header(fp);
2748 if (threads == NULL) {
2749 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2753 resort_rb__for_each_entry(nd, threads)
2754 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2756 resort_rb__delete(threads);
/*
 * parse_options() callback for --duration: store the threshold (in ms)
 * below which events are filtered out of the output.
 * NOTE(review): atof() silently yields 0.0 on malformed input — strtod()
 * with endptr/errno checking would reject bad values; confirm whether
 * "garbage means no filter" is intended before changing.
 */
2761 static int trace__set_duration(const struct option *opt, const char *str,
2762 int unset __maybe_unused)
2764 struct trace *trace = opt->value;
2766 trace->duration_filter = atof(str);
/*
 * parse_options() callback for --filter-pids: parse a CSV pid list into
 * trace->filter_pids.  Slot 0 is reserved for our own pid so perf trace
 * never traces itself; the user's pids follow from index 1.
 */
2770 static int trace__set_filter_pids(const struct option *opt, const char *str,
2771 int unset __maybe_unused)
2775 struct trace *trace = opt->value;
2777 * FIXME: introduce a intarray class, plain parse csv and create a
2778 * { int nr, int entries[] } struct...
2780 struct intlist *list = intlist__new(str);
/* +1 reserves the extra slot for getpid() below. */
2785 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2786 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2788 if (trace->filter_pids.entries == NULL)
2791 trace->filter_pids.entries[0] = getpid();
/* Copy the user-supplied pids; list index is off by one vs. entries[]. */
2793 for (i = 1; i < trace->filter_pids.nr; ++i)
2794 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2796 intlist__delete(list);
/*
 * Open the file given via -o/--output for writing, first preserving any
 * existing non-empty file as "<filename>.old".  Returns 0 on success or
 * -errno if fopen() failed.
 * NOTE(review): the rename() return value is ignored — if the backup
 * fails, the old contents are clobbered by the fopen("w") below.
 */
2802 static int trace__open_output(struct trace *trace, const char *filename)
2806 if (!stat(filename, &st) && st.st_size) {
2807 char oldname[PATH_MAX];
2809 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2811 rename(filename, oldname);
2814 trace->output = fopen(filename, "w");
2816 return trace->output == NULL ? -errno : 0;
/*
 * parse_options() callback for -F/--pf: OR the requested page-fault kinds
 * ("all", "maj" or "min") into the trace_pgfaults bitmask.  The visible
 * lines only handle the three known strings; how unknown strings are
 * rejected is outside this excerpt.
 */
2819 static int parse_pagefaults(const struct option *opt, const char *str,
2820 int unset __maybe_unused)
2822 int *trace_pgfaults = opt->value;
2824 if (strcmp(str, "all") == 0)
2825 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2826 else if (strcmp(str, "maj") == 0)
2827 *trace_pgfaults |= TRACE_PFMAJ;
2828 else if (strcmp(str, "min") == 0)
2829 *trace_pgfaults |= TRACE_PFMIN;
/* Assign the same sample handler to every evsel in the evlist. */
2836 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2838 struct perf_evsel *evsel;
2840 evlist__for_each_entry(evlist, evsel)
2841 evsel->handler = handler;
2845 * XXX: Hackish, just splitting the combined -e/--event list into syscalls
2846 * (raw_syscalls:{sys_{enter,exit}}) and other events (tracepoints, HW, SW, etc.)
2847 * so as to use existing facilities unchanged (trace->ev_qualifier + parse_options()).
2849 * It'd be better to introduce a parse_options() variant that would return a
2850 * list with the terms it didn't match to an event...
/*
 * parse_options() callback for -e/--event and --expr: split the CSV string
 * into two lists — lists[1] collects syscall names / strace group names
 * (becoming trace->ev_qualifier), lists[0] collects everything else, which
 * is handed to the stock parse_events_option().  Returns 0 on success,
 * negative on error.
 */
2852 static int trace__parse_events_option(const struct option *opt, const char *str,
2853 int unset __maybe_unused)
2855 struct trace *trace = (struct trace *)opt->value;
2856 const char *s = str;
2857 char *sep = NULL, *lists[2] = { NULL, NULL, };
/* len bounds each list: no entry set can exceed the full input + NUL. */
2858 int len = strlen(str) + 1, err = -1, list, idx;
2859 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2860 char group_name[PATH_MAX];
2862 if (strace_groups_dir == NULL)
/* A leading '!' (handled in elided lines) negates the qualifier. */
2867 trace->not_ev_qualifier = true;
/* Walk the CSV tokens; sep marks the end of the current token. */
2871 if ((sep = strchr(s, ',')) != NULL)
/* Token is a syscall name/glob, or a file in the strace groups dir? */
2875 if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2876 syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2879 path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2880 if (access(group_name, R_OK) == 0)
/* Append to an already-started list... */
2885 sprintf(lists[list] + strlen(lists[list]), ",%s", s);
/* ...or lazily allocate it on the first token of that kind. */
2887 lists[list] = malloc(len);
2888 if (lists[list] == NULL)
2890 strcpy(lists[list], s);
/* The syscall list becomes the event qualifier. */
2900 if (lists[1] != NULL) {
2901 struct strlist_config slist_config = {
2902 .dirname = strace_groups_dir,
2905 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2906 if (trace->ev_qualifier == NULL) {
2907 fputs("Not enough memory to parse event qualifier", trace->output);
2911 if (trace__validate_ev_qualifier(trace))
/* Everything else goes through the regular -e event parser. */
2918 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2919 "event selector. use 'perf list' to list available events",
2920 parse_events_option);
2921 err = parse_events_option(&o, lists[0], 0);
2930 int cmd_trace(int argc, const char **argv)
2932 const char *trace_usage[] = {
2933 "perf trace [<options>] [<command>]",
2934 "perf trace [<options>] -- <command> [<options>]",
2935 "perf trace record [<options>] [<command>]",
2936 "perf trace record [<options>] -- <command> [<options>]",
2939 struct trace trace = {
2948 .user_freq = UINT_MAX,
2949 .user_interval = ULLONG_MAX,
2950 .no_buffering = true,
2951 .mmap_pages = UINT_MAX,
2952 .proc_map_timeout = 500,
2956 .trace_syscalls = true,
2957 .kernel_syscallchains = false,
2958 .max_stack = UINT_MAX,
2960 const char *output_name = NULL;
2961 const struct option trace_options[] = {
2962 OPT_CALLBACK('e', "event", &trace, "event",
2963 "event/syscall selector. use 'perf list' to list available events",
2964 trace__parse_events_option),
2965 OPT_BOOLEAN(0, "comm", &trace.show_comm,
2966 "show the thread COMM next to its id"),
2967 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2968 OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
2969 trace__parse_events_option),
2970 OPT_STRING('o', "output", &output_name, "file", "output file name"),
2971 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2972 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2973 "trace events on existing process id"),
2974 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2975 "trace events on existing thread id"),
2976 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2977 "pids to filter (by the kernel)", trace__set_filter_pids),
2978 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2979 "system-wide collection from all CPUs"),
2980 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2981 "list of cpus to monitor"),
2982 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2983 "child tasks do not inherit counters"),
2984 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2985 "number of mmap data pages",
2986 perf_evlist__parse_mmap_pages),
2987 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2989 OPT_CALLBACK(0, "duration", &trace, "float",
2990 "show only events with duration > N.M ms",
2991 trace__set_duration),
2992 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2993 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2994 OPT_BOOLEAN('T', "time", &trace.full_time,
2995 "Show full timestamp, not time relative to first start"),
2996 OPT_BOOLEAN('s', "summary", &trace.summary_only,
2997 "Show only syscall summary with statistics"),
2998 OPT_BOOLEAN('S', "with-summary", &trace.summary,
2999 "Show all syscalls and summary with statistics"),
3000 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3001 "Trace pagefaults", parse_pagefaults, "maj"),
3002 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3003 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3004 OPT_CALLBACK(0, "call-graph", &trace.opts,
3005 "record_mode[,record_size]", record_callchain_help,
3006 &record_parse_callchain_opt),
3007 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3008 "Show the kernel callchains on the syscall exit path"),
3009 OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3010 "Set the minimum stack depth when parsing the callchain, "
3011 "anything below the specified depth will be ignored."),
3012 OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3013 "Set the maximum stack depth when parsing the callchain, "
3014 "anything beyond the specified depth will be ignored. "
3015 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3016 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3017 "per thread proc mmap processing timeout in ms"),
3018 OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3019 "ms to wait before starting measurement after program "
3023 bool __maybe_unused max_stack_user_set = true;
3024 bool mmap_pages_user_set = true;
3025 const char * const trace_subcommands[] = { "record", NULL };
3029 signal(SIGSEGV, sighandler_dump_stack);
3030 signal(SIGFPE, sighandler_dump_stack);
3032 trace.evlist = perf_evlist__new();
3033 trace.sctbl = syscalltbl__new();
3035 if (trace.evlist == NULL || trace.sctbl == NULL) {
3036 pr_err("Not enough memory to run!\n");
3041 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3042 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3044 err = bpf__setup_stdout(trace.evlist);
3046 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3047 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3053 if (trace.trace_pgfaults) {
3054 trace.opts.sample_address = true;
3055 trace.opts.sample_time = true;
3058 if (trace.opts.mmap_pages == UINT_MAX)
3059 mmap_pages_user_set = false;
3061 if (trace.max_stack == UINT_MAX) {
3062 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
3063 max_stack_user_set = false;
3066 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3067 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
3068 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3071 if (callchain_param.enabled) {
3072 if (!mmap_pages_user_set && geteuid() == 0)
3073 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3075 symbol_conf.use_callchain = true;
3078 if (trace.evlist->nr_entries > 0)
3079 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3081 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3082 return trace__record(&trace, argc-1, &argv[1]);
3084 /* summary_only implies summary option, but don't overwrite summary if set */
3085 if (trace.summary_only)
3086 trace.summary = trace.summary_only;
3088 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3089 trace.evlist->nr_entries == 0 /* Was --events used? */) {
3090 pr_err("Please specify something to trace.\n");
3094 if (!trace.trace_syscalls && trace.ev_qualifier) {
3095 pr_err("The -e option can't be used with --no-syscalls.\n");
3099 if (output_name != NULL) {
3100 err = trace__open_output(&trace, output_name);
3102 perror("failed to create output file");
3107 trace.open_id = syscalltbl__id(trace.sctbl, "open");
3109 err = target__validate(&trace.opts.target);
3111 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3112 fprintf(trace.output, "%s", bf);
3116 err = target__parse_uid(&trace.opts.target);
3118 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3119 fprintf(trace.output, "%s", bf);
3123 if (!argc && target__none(&trace.opts.target))
3124 trace.opts.target.system_wide = true;
3127 err = trace__replay(&trace);
3129 err = trace__run(&trace, argc, argv);
3132 if (output_name != NULL)
3133 fclose(trace.output);