1 // SPDX-License-Identifier: GPL-2.0
5 * Builtin record command: Record the profile of a workload
6 * (or a CPU, or a PID) into the perf.data output file - for
7 * later analysis via perf report.
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
69 #ifdef HAVE_EVENTFD_SUPPORT
70 #include <sys/eventfd.h>
74 #include <sys/types.h>
77 #include <linux/err.h>
78 #include <linux/string.h>
79 #include <linux/time64.h>
80 #include <linux/zalloc.h>
81 #include <linux/bitmap.h>
84 struct switch_output {
97 struct mmap_cpu_mask maps;
98 struct mmap_cpu_mask affinity;
101 struct record_thread {
103 struct thread_mask *mask;
108 struct fdarray pollfd;
112 struct mmap **overwrite_maps;
114 unsigned long long samples;
115 unsigned long waking;
117 u64 bytes_transferred;
118 u64 bytes_compressed;
121 static __thread struct record_thread *thread;
124 THREAD_MSG__UNDEFINED = 0,
129 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
134 THREAD_SPEC__UNDEFINED = 0,
137 THREAD_SPEC__PACKAGE,
143 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
144 "undefined", "cpu", "core", "package", "numa", "user"
147 struct pollfd_index_map {
148 int evlist_pollfd_index;
149 int thread_pollfd_index;
153 struct perf_tool tool;
154 struct record_opts opts;
156 u64 thread_bytes_written;
157 struct perf_data data;
158 struct auxtrace_record *itr;
159 struct evlist *evlist;
160 struct perf_session *session;
161 struct evlist *sb_evlist;
164 bool switch_output_event_set;
167 bool no_buildid_cache;
168 bool no_buildid_cache_set;
171 bool timestamp_filename;
172 bool timestamp_boundary;
174 struct switch_output switch_output;
175 unsigned long long samples;
176 unsigned long output_max_size; /* = 0: unlimited */
177 struct perf_debuginfod debuginfod;
179 struct thread_mask *thread_masks;
180 struct record_thread *thread_data;
181 struct pollfd_index_map *index_map;
183 size_t index_map_cnt;
186 static volatile int done;
188 static volatile int auxtrace_record__snapshot_started;
189 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
190 static DEFINE_TRIGGER(switch_output_trigger);
192 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
197 static inline pid_t gettid(void)
199 return (pid_t)syscall(__NR_gettid);
203 static int record__threads_enabled(struct record *rec)
205 return rec->opts.threads_spec;
208 static bool switch_output_signal(struct record *rec)
210 return rec->switch_output.signal &&
211 trigger_is_ready(&switch_output_trigger);
214 static bool switch_output_size(struct record *rec)
216 return rec->switch_output.size &&
217 trigger_is_ready(&switch_output_trigger) &&
218 (rec->bytes_written >= rec->switch_output.size);
221 static bool switch_output_time(struct record *rec)
223 return rec->switch_output.time &&
224 trigger_is_ready(&switch_output_trigger);
227 static u64 record__bytes_written(struct record *rec)
229 return rec->bytes_written + rec->thread_bytes_written;
232 static bool record__output_max_size_exceeded(struct record *rec)
234 return rec->output_max_size &&
235 (record__bytes_written(rec) >= rec->output_max_size);
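/*
 * Write a chunk of trace data. In parallel streaming mode (map->file set)
 * the bytes are accounted to the per-thread counters, otherwise to the main
 * perf.data stream; afterwards the --max-size limit and the switch-output
 * size trigger are checked against the bytes written so far.
 */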
238 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
239 void *bf, size_t size)
241 struct perf_data_file *file = &rec->session->data->file;
243 if (map && map->file)
246 if (perf_data_file__write(file, bf, size) < 0) {
247 pr_err("failed to write perf data, error: %m\n");
251 if (map && map->file) {
252 thread->bytes_written += size;
253 rec->thread_bytes_written += size;
255 rec->bytes_written += size;
258 if (record__output_max_size_exceeded(rec) && !done) {
259 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
260 " stopping session ]\n",
261 record__bytes_written(rec) >> 10);
265 if (switch_output_size(rec))
266 trigger_hit(&switch_output_trigger);
271 static int record__aio_enabled(struct record *rec);
272 static int record__comp_enabled(struct record *rec);
273 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
274 void *dst, size_t dst_size, void *src, size_t src_size);
276 #ifdef HAVE_AIO_SUPPORT
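/*
 * Queue an asynchronous write of the buffer at the given file offset using
 * POSIX AIO. Completion is not signalled (SIGEV_NONE); it is detected later
 * by polling aio_error()/aio_return() in record__aio_complete().
 */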
277 static int record__aio_write(struct aiocb *cblock, int trace_fd,
278 void *buf, size_t size, off_t off)
282 cblock->aio_fildes = trace_fd;
283 cblock->aio_buf = buf;
284 cblock->aio_nbytes = size;
285 cblock->aio_offset = off;
286 cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
289 rc = aio_write(cblock);
292 } else if (errno != EAGAIN) {
293 cblock->aio_fildes = -1;
294 pr_err("failed to queue perf data, error: %m\n");
302 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
308 ssize_t aio_ret, written;
310 aio_errno = aio_error(cblock);
311 if (aio_errno == EINPROGRESS)
314 written = aio_ret = aio_return(cblock);
316 if (aio_errno != EINTR)
317 pr_err("failed to write perf data, error: %m\n");
321 rem_size = cblock->aio_nbytes - written;
324 cblock->aio_fildes = -1;
326 * md->refcount is incremented in record__aio_pushfn() for
327 * every aio write request started in record__aio_push() so
328 * decrement it because the request is now complete.
330 perf_mmap__put(&md->core);
334 * The aio write request may require a restart with the
335 * remainder if the kernel didn't write the whole
338 rem_off = cblock->aio_offset + written;
339 rem_buf = (void *)(cblock->aio_buf + written);
340 record__aio_write(cblock, cblock->aio_fildes,
341 rem_buf, rem_size, rem_off);
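/*
 * Wait for in-flight aio requests on this map and return the index of a
 * free control block; with sync_all, keep calling aio_suspend() until every
 * outstanding request for the map has completed.
 */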
348 static int record__aio_sync(struct mmap *md, bool sync_all)
350 struct aiocb **aiocb = md->aio.aiocb;
351 struct aiocb *cblocks = md->aio.cblocks;
352 struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
357 for (i = 0; i < md->aio.nr_cblocks; ++i) {
358 if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
365 * The started aio write is not complete yet,
366 * so it has to be waited on before the
369 aiocb[i] = &cblocks[i];
376 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
377 if (!(errno == EAGAIN || errno == EINTR))
378 pr_err("failed to sync perf data, error: %m\n");
389 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
391 struct record_aio *aio = to;
394 * map->core.base data pointed to by buf is copied into a free map->aio.data[] buffer
395 * to release space in the kernel buffer as fast as possible, calling
396 * perf_mmap__consume() from the perf_mmap__push() function.
398 * That lets the kernel proceed with storing more profiling data into
399 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
401 * Copying can be done in two steps in case the chunk of profiling data
402 * crosses the upper bound of the kernel buffer. In this case we first move
403 * the part of the data from map->start till the upper bound and then the remainder
404 * from the beginning of the kernel buffer till the end of the data chunk.
407 if (record__comp_enabled(aio->rec)) {
408 size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
409 mmap__mmap_len(map) - aio->size,
412 memcpy(aio->data + aio->size, buf, size);
417 * Increment map->refcount to guard the map->aio.data[] buffer
418 * from premature deallocation, because the map object can be
419 * released before the aio write request started on the
420 * map->aio.data[] buffer has completed.
422 * perf_mmap__put() is done at record__aio_complete()
423 * after the started aio request completes, or at record__aio_push()
424 * if the request failed to start.
426 perf_mmap__get(&map->core);
434 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
437 int trace_fd = rec->session->data->file.fd;
438 struct record_aio aio = { .rec = rec, .size = 0 };
441 * Call record__aio_sync() to wait until a map->aio.data[] buffer
442 * becomes available after the previous aio write operation.
445 idx = record__aio_sync(map, false);
446 aio.data = map->aio.data[idx];
447 ret = perf_mmap__push(map, &aio, record__aio_pushfn);
448 if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
452 ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
455 rec->bytes_written += aio.size;
456 if (switch_output_size(rec))
457 trigger_hit(&switch_output_trigger);
460 * Decrement the map->refcount incremented in record__aio_pushfn()
461 * if the record__aio_write() operation failed to start; otherwise
462 * map->refcount is decremented in record__aio_complete() after
463 * the aio write operation finishes successfully.
465 perf_mmap__put(&map->core);
471 static off_t record__aio_get_pos(int trace_fd)
473 return lseek(trace_fd, 0, SEEK_CUR);
476 static void record__aio_set_pos(int trace_fd, off_t pos)
478 lseek(trace_fd, pos, SEEK_SET);
481 static void record__aio_mmap_read_sync(struct record *rec)
484 struct evlist *evlist = rec->evlist;
485 struct mmap *maps = evlist->mmap;
487 if (!record__aio_enabled(rec))
490 for (i = 0; i < evlist->core.nr_mmaps; i++) {
491 struct mmap *map = &maps[i];
494 record__aio_sync(map, true);
498 static int nr_cblocks_default = 1;
499 static int nr_cblocks_max = 4;
501 static int record__aio_parse(const struct option *opt,
505 struct record_opts *opts = (struct record_opts *)opt->value;
508 opts->nr_cblocks = 0;
511 opts->nr_cblocks = strtol(str, NULL, 0);
512 if (!opts->nr_cblocks)
513 opts->nr_cblocks = nr_cblocks_default;
518 #else /* HAVE_AIO_SUPPORT */
519 static int nr_cblocks_max = 0;
521 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
522 off_t *off __maybe_unused)
527 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
532 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
536 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
541 static int record__aio_enabled(struct record *rec)
543 return rec->opts.nr_cblocks > 0;
546 #define MMAP_FLUSH_DEFAULT 1
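/*
 * Parse the --mmap-flush argument: a plain number or a B/K/M/G suffixed
 * size, defaulting to MMAP_FLUSH_DEFAULT and clamped to the mmap buffer size.
 */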
547 static int record__mmap_flush_parse(const struct option *opt,
552 struct record_opts *opts = (struct record_opts *)opt->value;
553 static struct parse_tag tags[] = {
554 { .tag = 'B', .mult = 1 },
555 { .tag = 'K', .mult = 1 << 10 },
556 { .tag = 'M', .mult = 1 << 20 },
557 { .tag = 'G', .mult = 1 << 30 },
565 opts->mmap_flush = parse_tag_value(str, tags);
566 if (opts->mmap_flush == (int)-1)
567 opts->mmap_flush = strtol(str, NULL, 0);
570 if (!opts->mmap_flush)
571 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
573 flush_max = evlist__mmap_size(opts->mmap_pages);
575 if (opts->mmap_flush > flush_max)
576 opts->mmap_flush = flush_max;
581 #ifdef HAVE_ZSTD_SUPPORT
582 static unsigned int comp_level_default = 1;
584 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
586 struct record_opts *opts = opt->value;
589 opts->comp_level = 0;
592 opts->comp_level = strtol(str, NULL, 0);
593 if (!opts->comp_level)
594 opts->comp_level = comp_level_default;
600 static unsigned int comp_level_max = 22;
602 static int record__comp_enabled(struct record *rec)
604 return rec->opts.comp_level > 0;
607 static int process_synthesized_event(struct perf_tool *tool,
608 union perf_event *event,
609 struct perf_sample *sample __maybe_unused,
610 struct machine *machine __maybe_unused)
612 struct record *rec = container_of(tool, struct record, tool);
613 return record__write(rec, NULL, event, event->header.size);
616 static struct mutex synth_lock;
618 static int process_locked_synthesized_event(struct perf_tool *tool,
619 union perf_event *event,
620 struct perf_sample *sample __maybe_unused,
621 struct machine *machine __maybe_unused)
625 mutex_lock(&synth_lock);
626 ret = process_synthesized_event(tool, event, sample, machine);
627 mutex_unlock(&synth_lock);
631 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
633 struct record *rec = to;
635 if (record__comp_enabled(rec)) {
636 size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
641 return record__write(rec, map, bf, size);
644 static volatile sig_atomic_t signr = -1;
645 static volatile sig_atomic_t child_finished;
646 #ifdef HAVE_EVENTFD_SUPPORT
647 static volatile sig_atomic_t done_fd = -1;
650 static void sig_handler(int sig)
658 #ifdef HAVE_EVENTFD_SUPPORT
661 int orig_errno = errno;
664 * It is possible for this signal handler to run after done is
665 * checked in the main loop, but before the perf counter fds are
666 * polled. If this happens, the poll() will continue to wait
667 * even though done is set, and will only break out if either
668 * another signal is received, or the counters are ready for
669 * read. To ensure the poll() doesn't sleep when done is set,
670 * use an eventfd (done_fd) to wake up the poll().
672 if (write(done_fd, &tmp, sizeof(tmp)) < 0)
673 pr_err("failed to signal wakeup fd, error: %m\n");
677 #endif // HAVE_EVENTFD_SUPPORT
680 static void sigsegv_handler(int sig)
682 perf_hooks__recover();
683 sighandler_dump_stack(sig);
686 static void record__sig_exit(void)
691 signal(signr, SIG_DFL);
695 #ifdef HAVE_AUXTRACE_SUPPORT
697 static int record__process_auxtrace(struct perf_tool *tool,
699 union perf_event *event, void *data1,
700 size_t len1, void *data2, size_t len2)
702 struct record *rec = container_of(tool, struct record, tool);
703 struct perf_data *data = &rec->data;
707 if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
709 int fd = perf_data__fd(data);
712 file_offset = lseek(fd, 0, SEEK_CUR);
713 if (file_offset == -1)
715 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
721 /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
722 padding = (len1 + len2) & 7;
724 padding = 8 - padding;
726 record__write(rec, map, event, event->header.size);
727 record__write(rec, map, data1, len1);
729 record__write(rec, map, data2, len2);
730 record__write(rec, map, &pad, padding);
735 static int record__auxtrace_mmap_read(struct record *rec,
740 ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
741 record__process_auxtrace);
751 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
756 ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
757 record__process_auxtrace,
758 rec->opts.auxtrace_snapshot_size);
768 static int record__auxtrace_read_snapshot_all(struct record *rec)
773 for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
774 struct mmap *map = &rec->evlist->mmap[i];
776 if (!map->auxtrace_mmap.base)
779 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
788 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
790 pr_debug("Recording AUX area tracing snapshot\n");
791 if (record__auxtrace_read_snapshot_all(rec) < 0) {
792 trigger_error(&auxtrace_snapshot_trigger);
794 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
795 trigger_error(&auxtrace_snapshot_trigger);
797 trigger_ready(&auxtrace_snapshot_trigger);
801 static int record__auxtrace_snapshot_exit(struct record *rec)
803 if (trigger_is_error(&auxtrace_snapshot_trigger))
806 if (!auxtrace_record__snapshot_started &&
807 auxtrace_record__snapshot_start(rec->itr))
810 record__read_auxtrace_snapshot(rec, true);
811 if (trigger_is_error(&auxtrace_snapshot_trigger))
817 static int record__auxtrace_init(struct record *rec)
821 if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
822 && record__threads_enabled(rec)) {
823 pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
828 rec->itr = auxtrace_record__init(rec->evlist, &err);
833 err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
834 rec->opts.auxtrace_snapshot_opts);
838 err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
839 rec->opts.auxtrace_sample_opts);
843 auxtrace_regroup_aux_output(rec->evlist);
845 return auxtrace_parse_filters(rec->evlist);
851 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
852 struct mmap *map __maybe_unused)
858 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
859 bool on_exit __maybe_unused)
864 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
870 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
875 static int record__auxtrace_init(struct record *rec __maybe_unused)
882 static int record__config_text_poke(struct evlist *evlist)
886 /* Nothing to do if text poke is already configured */
887 evlist__for_each_entry(evlist, evsel) {
888 if (evsel->core.attr.text_poke)
892 evsel = evlist__add_dummy_on_all_cpus(evlist);
896 evsel->core.attr.text_poke = 1;
897 evsel->core.attr.ksymbol = 1;
898 evsel->immediate = true;
899 evsel__set_sample_bit(evsel, TIME);
904 static int record__config_off_cpu(struct record *rec)
906 return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
909 static bool record__kcore_readable(struct machine *machine)
911 char kcore[PATH_MAX];
914 scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
916 fd = open(kcore, O_RDONLY);
925 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
927 char from_dir[PATH_MAX];
928 char kcore_dir[PATH_MAX];
931 snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
933 ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
937 return kcore_copy(from_dir, kcore_dir);
940 static void record__thread_data_init_pipes(struct record_thread *thread_data)
942 thread_data->pipes.msg[0] = -1;
943 thread_data->pipes.msg[1] = -1;
944 thread_data->pipes.ack[0] = -1;
945 thread_data->pipes.ack[1] = -1;
948 static int record__thread_data_open_pipes(struct record_thread *thread_data)
950 if (pipe(thread_data->pipes.msg))
953 if (pipe(thread_data->pipes.ack)) {
954 close(thread_data->pipes.msg[0]);
955 thread_data->pipes.msg[0] = -1;
956 close(thread_data->pipes.msg[1]);
957 thread_data->pipes.msg[1] = -1;
961 pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
962 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
963 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
968 static void record__thread_data_close_pipes(struct record_thread *thread_data)
970 if (thread_data->pipes.msg[0] != -1) {
971 close(thread_data->pipes.msg[0]);
972 thread_data->pipes.msg[0] = -1;
974 if (thread_data->pipes.msg[1] != -1) {
975 close(thread_data->pipes.msg[1]);
976 thread_data->pipes.msg[1] = -1;
978 if (thread_data->pipes.ack[0] != -1) {
979 close(thread_data->pipes.ack[0]);
980 thread_data->pipes.ack[0] = -1;
982 if (thread_data->pipes.ack[1] != -1) {
983 close(thread_data->pipes.ack[1]);
984 thread_data->pipes.ack[1] = -1;
988 static bool evlist__per_thread(struct evlist *evlist)
990 return cpu_map__is_dummy(evlist->core.user_requested_cpus);
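/*
 * Assign mmaps to a reader thread: in per-thread mode the thread gets all
 * nr_mmaps maps, otherwise only the maps whose CPU is set in the thread's
 * maps bitmap.
 */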
993 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
995 int m, tm, nr_mmaps = evlist->core.nr_mmaps;
996 struct mmap *mmap = evlist->mmap;
997 struct mmap *overwrite_mmap = evlist->overwrite_mmap;
998 struct perf_cpu_map *cpus = evlist->core.all_cpus;
999 bool per_thread = evlist__per_thread(evlist);
1002 thread_data->nr_mmaps = nr_mmaps;
1004 thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1005 thread_data->mask->maps.nbits);
1007 thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1008 if (!thread_data->maps)
1011 if (overwrite_mmap) {
1012 thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1013 if (!thread_data->overwrite_maps) {
1014 zfree(&thread_data->maps);
1018 pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1019 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1021 for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1023 test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1024 if (thread_data->maps) {
1025 thread_data->maps[tm] = &mmap[m];
1026 pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1027 thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1029 if (thread_data->overwrite_maps) {
1030 thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1031 pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1032 thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1041 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1044 struct mmap *map, *overwrite_map;
1046 fdarray__init(&thread_data->pollfd, 64);
1048 for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1049 map = thread_data->maps ? thread_data->maps[tm] : NULL;
1050 overwrite_map = thread_data->overwrite_maps ?
1051 thread_data->overwrite_maps[tm] : NULL;
1053 for (f = 0; f < evlist->core.pollfd.nr; f++) {
1054 void *ptr = evlist->core.pollfd.priv[f].ptr;
1056 if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1057 pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1058 &evlist->core.pollfd);
1061 pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1062 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1070 static void record__free_thread_data(struct record *rec)
1073 struct record_thread *thread_data = rec->thread_data;
1075 if (thread_data == NULL)
1078 for (t = 0; t < rec->nr_threads; t++) {
1079 record__thread_data_close_pipes(&thread_data[t]);
1080 zfree(&thread_data[t].maps);
1081 zfree(&thread_data[t].overwrite_maps);
1082 fdarray__exit(&thread_data[t].pollfd);
1085 zfree(&rec->thread_data);
1088 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1089 int evlist_pollfd_index,
1090 int thread_pollfd_index)
1092 size_t x = rec->index_map_cnt;
1094 if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1096 rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1097 rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1098 rec->index_map_cnt += 1;
1102 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1103 struct evlist *evlist,
1104 struct record_thread *thread_data)
1106 struct pollfd *e_entries = evlist->core.pollfd.entries;
1107 struct pollfd *t_entries = thread_data->pollfd.entries;
1111 for (i = 0; i < rec->index_map_cnt; i++) {
1112 int e_pos = rec->index_map[i].evlist_pollfd_index;
1113 int t_pos = rec->index_map[i].thread_pollfd_index;
1115 if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1116 e_entries[e_pos].events != t_entries[t_pos].events) {
1117 pr_err("Thread and evlist pollfd index mismatch\n");
1121 e_entries[e_pos].revents = t_entries[t_pos].revents;
1126 static int record__dup_non_perf_events(struct record *rec,
1127 struct evlist *evlist,
1128 struct record_thread *thread_data)
1130 struct fdarray *fda = &evlist->core.pollfd;
1133 for (i = 0; i < fda->nr; i++) {
1134 if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1136 ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1138 pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1141 pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1142 thread_data, ret, fda->entries[i].fd);
1143 ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1145 pr_err("Failed to map thread and evlist pollfd indexes\n");
1152 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1155 struct record_thread *thread_data;
1157 rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1158 if (!rec->thread_data) {
1159 pr_err("Failed to allocate thread data\n");
1162 thread_data = rec->thread_data;
1164 for (t = 0; t < rec->nr_threads; t++)
1165 record__thread_data_init_pipes(&thread_data[t]);
1167 for (t = 0; t < rec->nr_threads; t++) {
1168 thread_data[t].rec = rec;
1169 thread_data[t].mask = &rec->thread_masks[t];
1170 ret = record__thread_data_init_maps(&thread_data[t], evlist);
1172 pr_err("Failed to initialize thread[%d] maps\n", t);
1175 ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1177 pr_err("Failed to initialize thread[%d] pollfd\n", t);
1181 thread_data[t].tid = -1;
1182 ret = record__thread_data_open_pipes(&thread_data[t]);
1184 pr_err("Failed to open thread[%d] communication pipes\n", t);
1187 ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1188 POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1190 pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1193 thread_data[t].ctlfd_pos = ret;
1194 pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1195 thread_data, thread_data[t].ctlfd_pos,
1196 thread_data[t].pipes.msg[0]);
1198 thread_data[t].tid = gettid();
1200 ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1204 thread_data[t].ctlfd_pos = -1; /* Not used */
1211 record__free_thread_data(rec);
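/*
 * mmap the evlist ring buffers and AUX area, set up the control fd, allocate
 * the per-thread data and, in parallel streaming mode, create the perf.data
 * directory with one file per mmap.
 */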
1216 static int record__mmap_evlist(struct record *rec,
1217 struct evlist *evlist)
1220 struct record_opts *opts = &rec->opts;
1221 bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1222 opts->auxtrace_sample_mode;
1225 if (opts->affinity != PERF_AFFINITY_SYS)
1226 cpu__setup_cpunode_map();
1228 if (evlist__mmap_ex(evlist, opts->mmap_pages,
1229 opts->auxtrace_mmap_pages,
1231 opts->nr_cblocks, opts->affinity,
1232 opts->mmap_flush, opts->comp_level) < 0) {
1233 if (errno == EPERM) {
1234 pr_err("Permission error mapping pages.\n"
1235 "Consider increasing "
1236 "/proc/sys/kernel/perf_event_mlock_kb,\n"
1237 "or try again with a smaller value of -m/--mmap_pages.\n"
1238 "(current value: %u,%u)\n",
1239 opts->mmap_pages, opts->auxtrace_mmap_pages);
1242 pr_err("failed to mmap with %d (%s)\n", errno,
1243 str_error_r(errno, msg, sizeof(msg)));
1251 if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1254 ret = record__alloc_thread_data(rec, evlist);
1258 if (record__threads_enabled(rec)) {
1259 ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1261 pr_err("Failed to create data directory: %s\n", strerror(-ret));
1264 for (i = 0; i < evlist->core.nr_mmaps; i++) {
1266 evlist->mmap[i].file = &rec->data.dir.files[i];
1267 if (evlist->overwrite_mmap)
1268 evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1275 static int record__mmap(struct record *rec)
1277 return record__mmap_evlist(rec, rec->evlist);
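/*
 * Open all events: add a tracking dummy event when needed (initial delay,
 * system-wide or hybrid), open each evsel with fallback and weak group
 * handling, apply the filters and finally mmap the ring buffers.
 */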
1280 static int record__open(struct record *rec)
1284 struct evlist *evlist = rec->evlist;
1285 struct perf_session *session = rec->session;
1286 struct record_opts *opts = &rec->opts;
1290 * For initial_delay, system-wide profiling or a hybrid system, we need to add a
1291 * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
1292 * of waiting or of event synthesis.
1294 if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
1295 perf_pmus__num_core_pmus() > 1) {
1296 pos = evlist__get_tracking_event(evlist);
1297 if (!evsel__is_dummy_event(pos)) {
1298 /* Set up dummy event. */
1299 if (evlist__add_dummy(evlist))
1301 pos = evlist__last(evlist);
1302 evlist__set_tracking_event(evlist, pos);
1306 * Enable the dummy event when the process is forked for
1307 * initial_delay, immediately for system wide.
1309 if (opts->target.initial_delay && !pos->immediate &&
1310 !target__has_cpu(&opts->target))
1311 pos->core.attr.enable_on_exec = 1;
1316 evlist__config(evlist, opts, &callchain_param);
1318 evlist__for_each_entry(evlist, pos) {
1320 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1321 if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
1323 ui__warning("%s\n", msg);
1326 if ((errno == EINVAL || errno == EBADF) &&
1327 pos->core.leader != &pos->core &&
1329 pos = evlist__reset_weak_group(evlist, pos, true);
1333 evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1334 ui__error("%s\n", msg);
1338 pos->supported = true;
1341 if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1343 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1344 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1345 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1346 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1347 "Samples in kernel modules won't be resolved at all.\n\n"
1348 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1349 "even with a suitable vmlinux or kallsyms file.\n\n");
1352 if (evlist__apply_filters(evlist, &pos)) {
1353 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1354 pos->filter ?: "BPF", evsel__name(pos), errno,
1355 str_error_r(errno, msg, sizeof(msg)));
1360 rc = record__mmap(rec);
1364 session->evlist = evlist;
1365 perf_session__set_id_hdr_size(session);
1370 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1372 if (rec->evlist->first_sample_time == 0)
1373 rec->evlist->first_sample_time = sample_time;
1376 rec->evlist->last_sample_time = sample_time;
1379 static int process_sample_event(struct perf_tool *tool,
1380 union perf_event *event,
1381 struct perf_sample *sample,
1382 struct evsel *evsel,
1383 struct machine *machine)
1385 struct record *rec = container_of(tool, struct record, tool);
1387 set_timestamp_boundary(rec, sample->time);
1389 if (rec->buildid_all)
1393 return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1396 static int process_buildids(struct record *rec)
1398 struct perf_session *session = rec->session;
1400 if (perf_data__size(&rec->data) == 0)
1404 * During this process, it'll load the kernel map and replace the
1405 * dso->long_name with a real pathname it found. In this case
1406 * we prefer the vmlinux path like
1407 * /lib/modules/3.16.4/build/vmlinux
1409 * rather than build-id path (in debug directory).
1410 * $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1412 symbol_conf.ignore_vmlinux_buildid = true;
1415 * If --buildid-all is given, it marks all DSOs regardless of hits,
1416 * so no need to process samples. But if timestamp_boundary is enabled,
1417 * it still needs to walk on all samples to get the timestamps of
1418 * first/last samples.
1420 if (rec->buildid_all && !rec->timestamp_boundary)
1421 rec->tool.sample = NULL;
1423 return perf_session__process_events(session);
1426 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1429 struct perf_tool *tool = data;
1431 * As for the guest kernel, when processing the record & report subcommands,
1432 * we arrange the module mmaps prior to the guest kernel mmap and trigger
1433 * a preload of the dso because default guest module symbols are loaded
1434 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
1435 * method is used to avoid missing symbols when the first addr is
1436 * in a module instead of in the guest kernel.
1438 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1441 pr_err("Couldn't record guest kernel [%d]'s reference"
1442 " relocation symbol.\n", machine->pid);
1445 * We use _stext for the guest kernel because the guest kernel's /proc/kallsyms
1446 * sometimes has no _text.
1448 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1451 pr_err("Couldn't record guest kernel [%d]'s reference"
1452 " relocation symbol.\n", machine->pid);
1455 static struct perf_event_header finished_round_event = {
1456 .size = sizeof(struct perf_event_header),
1457 .type = PERF_RECORD_FINISHED_ROUND,
1460 static struct perf_event_header finished_init_event = {
1461 .size = sizeof(struct perf_event_header),
1462 .type = PERF_RECORD_FINISHED_INIT,
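/*
 * With --affinity=cpu/node (anything but the default "sys"), migrate the
 * reading thread onto the CPUs in the map's affinity mask before reading it,
 * so the ring buffer is consumed from a nearby CPU.
 */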
1465 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1467 if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1468 !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1469 thread->mask->affinity.nbits)) {
1470 bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1471 bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1472 map->affinity_mask.bits, thread->mask->affinity.nbits);
1473 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1474 (cpu_set_t *)thread->mask->affinity.bits);
1476 pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1477 mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1482 static size_t process_comp_header(void *record, size_t increment)
1484 struct perf_record_compressed *event = record;
1485 size_t size = sizeof(*event);
1488 event->header.size += increment;
1492 event->header.type = PERF_RECORD_COMPRESSED;
1493 event->header.size = size;
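/*
 * Compress src into dst as one or more PERF_RECORD_COMPRESSED records. In
 * parallel streaming mode (map->file set) the per-map zstd stream and the
 * per-thread byte counters are used, otherwise the session-wide ones.
 */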
1498 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1499 void *dst, size_t dst_size, void *src, size_t src_size)
1502 size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1503 struct zstd_data *zstd_data = &session->zstd_data;
1505 if (map && map->file)
1506 zstd_data = &map->zstd_data;
1508 compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1509 max_record_size, process_comp_header);
1511 if (map && map->file) {
1512 thread->bytes_transferred += src_size;
1513 thread->bytes_compressed += compressed;
1515 session->bytes_transferred += src_size;
1516 session->bytes_compressed += compressed;
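/*
 * Drain the thread's (overwrite or regular) mmaps: push the kernel ring
 * buffer data through record__pushfn() or the AIO path, read AUX area data
 * unless in snapshot/sample mode, and emit PERF_RECORD_FINISHED_ROUND once
 * something was written (not needed in threaded/directory mode).
 */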
1522 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1523 bool overwrite, bool synch)
1525 u64 bytes_written = rec->bytes_written;
1530 int trace_fd = rec->data.file.fd;
1536 nr_mmaps = thread->nr_mmaps;
1537 maps = overwrite ? thread->overwrite_maps : thread->maps;
1542 if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1545 if (record__aio_enabled(rec))
1546 off = record__aio_get_pos(trace_fd);
1548 for (i = 0; i < nr_mmaps; i++) {
1550 struct mmap *map = maps[i];
1552 if (map->core.base) {
1553 record__adjust_affinity(rec, map);
1555 flush = map->core.flush;
1556 map->core.flush = 1;
1558 if (!record__aio_enabled(rec)) {
1559 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1561 map->core.flush = flush;
1566 if (record__aio_push(rec, map, &off) < 0) {
1567 record__aio_set_pos(trace_fd, off);
1569 map->core.flush = flush;
1575 map->core.flush = flush;
1578 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1579 !rec->opts.auxtrace_sample_mode &&
1580 record__auxtrace_mmap_read(rec, map) != 0) {
1586 if (record__aio_enabled(rec))
1587 record__aio_set_pos(trace_fd, off);
1590 * Mark the round finished in case we wrote
1591 * at least one event.
1593 * No need for round events in directory mode,
1594 * because per-cpu maps and files have data
1597 if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1598 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1601 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1606 static int record__mmap_read_all(struct record *rec, bool synch)
1610 err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1614 return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1617 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1618 void *arg __maybe_unused)
1620 struct perf_mmap *map = fda->priv[fd].ptr;
1623 perf_mmap__put(map);
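/*
 * Body of a parallel streaming reader thread: acknowledge the start over the
 * ack pipe, then loop draining the thread's mmaps and polling its fdarray
 * until the main thread closes the msg pipe (POLLHUP); do a final synchronous
 * drain and acknowledge termination.
 */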
1626 static void *record__thread(void *arg)
1628 enum thread_msg msg = THREAD_MSG__READY;
1629 bool terminate = false;
1630 struct fdarray *pollfd;
1634 thread->tid = gettid();
1636 err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1638 pr_warning("threads[%d]: failed to notify on start: %s\n",
1639 thread->tid, strerror(errno));
1641 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1643 pollfd = &thread->pollfd;
1644 ctlfd_pos = thread->ctlfd_pos;
1647 unsigned long long hits = thread->samples;
1649 if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1652 if (hits == thread->samples) {
1654 err = fdarray__poll(pollfd, -1);
1656 * Propagate error, only if there's any. Ignore positive
1657 * number of returned events and interrupt error.
1659 if (err > 0 || (err < 0 && errno == EINTR))
1663 if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1664 record__thread_munmap_filtered, NULL) == 0)
1668 if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1670 close(thread->pipes.msg[0]);
1671 thread->pipes.msg[0] = -1;
1672 pollfd->entries[ctlfd_pos].fd = -1;
1673 pollfd->entries[ctlfd_pos].events = 0;
1676 pollfd->entries[ctlfd_pos].revents = 0;
1678 record__mmap_read_all(thread->rec, true);
1680 err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1682 pr_warning("threads[%d]: failed to notify on termination: %s\n",
1683 thread->tid, strerror(errno));
1688 static void record__init_features(struct record *rec)
1690 struct perf_session *session = rec->session;
1693 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1694 perf_header__set_feat(&session->header, feat);
1696 if (rec->no_buildid)
1697 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1699 #ifdef HAVE_LIBTRACEEVENT
1700 if (!have_tracepoints(&rec->evlist->core.entries))
1701 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1704 if (!rec->opts.branch_stack)
1705 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1707 if (!rec->opts.full_auxtrace)
1708 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1710 if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1711 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1713 if (!rec->opts.use_clockid)
1714 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1716 if (!record__threads_enabled(rec))
1717 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1719 if (!record__comp_enabled(rec))
1720 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1722 perf_header__clear_feat(&session->header, HEADER_STAT);
1726 record__finish_output(struct record *rec)
1729 struct perf_data *data = &rec->data;
1730 int fd = perf_data__fd(data);
1735 rec->session->header.data_size += rec->bytes_written;
1736 data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1737 if (record__threads_enabled(rec)) {
1738 for (i = 0; i < data->dir.nr; i++)
1739 data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1742 if (!rec->no_buildid) {
1743 process_buildids(rec);
1745 if (rec->buildid_all)
1746 dsos__hit_all(rec->session);
1748 perf_session__write_header(rec->session, rec->evlist, fd, true);
1753 static int record__synthesize_workload(struct record *rec, bool tail)
1756 struct perf_thread_map *thread_map;
1757 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1759 if (rec->opts.tail_synthesize != tail)
1762 thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1763 if (thread_map == NULL)
1766 err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1767 process_synthesized_event,
1768 &rec->session->machines.host,
1770 rec->opts.sample_address);
1771 perf_thread_map__put(thread_map);
1775 static int write_finished_init(struct record *rec, bool tail)
1777 if (rec->opts.tail_synthesize != tail)
1780 return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1783 static int record__synthesize(struct record *rec, bool tail);
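/*
 * Rotate the output: flush pending AIO, synthesize the "tail" events, finish
 * the current file and switch perf.data to a timestamp-suffixed file. With
 * --switch-max-files the oldest rotated file is removed and its slot reused.
 */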
1786 record__switch_output(struct record *rec, bool at_exit)
1788 struct perf_data *data = &rec->data;
1792 /* Same size as "2015122520103046" */
1793 char timestamp[] = "InvalidTimestamp";
1795 record__aio_mmap_read_sync(rec);
1797 write_finished_init(rec, true);
1799 record__synthesize(rec, true);
1800 if (target__none(&rec->opts.target))
1801 record__synthesize_workload(rec, true);
1804 record__finish_output(rec);
1805 err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1807 pr_err("Failed to get current timestamp\n");
1811 fd = perf_data__switch(data, timestamp,
1812 rec->session->header.data_offset,
1813 at_exit, &new_filename);
1814 if (fd >= 0 && !at_exit) {
1815 rec->bytes_written = 0;
1816 rec->session->header.data_size = 0;
1820 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1821 data->path, timestamp);
1823 if (rec->switch_output.num_files) {
1824 int n = rec->switch_output.cur_file + 1;
1826 if (n >= rec->switch_output.num_files)
1828 rec->switch_output.cur_file = n;
1829 if (rec->switch_output.filenames[n]) {
1830 remove(rec->switch_output.filenames[n]);
1831 zfree(&rec->switch_output.filenames[n]);
1833 rec->switch_output.filenames[n] = new_filename;
1838 /* Output tracking events */
1840 record__synthesize(rec, false);
1843 * In 'perf record --switch-output' without -a,
1844 * record__synthesize() in record__switch_output() won't
1845 * generate tracking events because there's no thread_map
1846 * in the evlist. As a result, the newly created perf.data file doesn't
1847 * contain map and comm information.
1848 * Create a fake thread_map and directly call
1849 * perf_event__synthesize_thread_map() for those events.
1851 if (target__none(&rec->opts.target))
1852 record__synthesize_workload(rec, false);
1853 write_finished_init(rec, false);
1858 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1859 struct perf_record_lost_samples *lost,
1860 int cpu_idx, int thread_idx, u64 lost_count,
1863 struct perf_sample_id *sid;
1864 struct perf_sample sample = {};
1867 lost->lost = lost_count;
1868 if (evsel->core.ids) {
1869 sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1870 sample.id = sid->id;
1873 id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1874 evsel->core.attr.sample_type, &sample);
1875 lost->header.size = sizeof(*lost) + id_hdr_size;
1876 lost->header.misc = misc_flag;
1877 record__write(rec, NULL, lost, lost->header.size);
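/*
 * At the end of the session, read the lost-sample counts of every event fd
 * and emit matching PERF_RECORD_LOST_SAMPLES records; drops done by BPF
 * filters are flagged with PERF_RECORD_MISC_LOST_SAMPLES_BPF.
 */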
1880 static void record__read_lost_samples(struct record *rec)
1882 struct perf_session *session = rec->session;
1883 struct perf_record_lost_samples *lost;
1884 struct evsel *evsel;
1886 /* there was an error during record__open */
1887 if (session->evlist == NULL)
1890 lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1892 pr_debug("Memory allocation failed\n");
1896 lost->header.type = PERF_RECORD_LOST_SAMPLES;
1898 evlist__for_each_entry(session->evlist, evsel) {
1899 struct xyarray *xy = evsel->core.sample_id;
1902 if (xy == NULL || evsel->core.fd == NULL)
1904 if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1905 xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1906 pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1910 for (int x = 0; x < xyarray__max_x(xy); x++) {
1911 for (int y = 0; y < xyarray__max_y(xy); y++) {
1912 struct perf_counts_values count;
1914 if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1915 pr_debug("read LOST count failed\n");
1920 __record__save_lost_samples(rec, evsel, lost,
1921 x, y, count.lost, 0);
1926 lost_count = perf_bpf_filter__lost_count(evsel);
1928 __record__save_lost_samples(rec, evsel, lost, 0, 0, lost_count,
1929 PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1935 static volatile sig_atomic_t workload_exec_errno;
1938 * evlist__prepare_workload will send a SIGUSR1
1939 * if the fork fails, since we asked by setting its
1940 * want_signal to true.
1942 static void workload_exec_failed_signal(int signo __maybe_unused,
1944 void *ucontext __maybe_unused)
1946 workload_exec_errno = info->si_value.sival_int;
1951 static void snapshot_sig_handler(int sig);
1952 static void alarm_sig_handler(int sig);
1954 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1957 if (evlist->mmap && evlist->mmap[0].core.base)
1958 return evlist->mmap[0].core.base;
1959 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1960 return evlist->overwrite_mmap[0].core.base;
1965 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1967 const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
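/*
 * Emit the synthesized side-band events: time conversion, id index, auxtrace
 * info, kernel and module mmaps, guest machines, extra attrs, thread and cpu
 * maps, BPF and cgroup events, and finally the existing tasks (optionally in
 * parallel, serialized through synth_lock).
 */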
1973 static int record__synthesize(struct record *rec, bool tail)
1975 struct perf_session *session = rec->session;
1976 struct machine *machine = &session->machines.host;
1977 struct perf_data *data = &rec->data;
1978 struct record_opts *opts = &rec->opts;
1979 struct perf_tool *tool = &rec->tool;
1981 event_op f = process_synthesized_event;
1983 if (rec->opts.tail_synthesize != tail)
1986 if (data->is_pipe) {
1987 err = perf_event__synthesize_for_pipe(tool, session, data,
1988 process_synthesized_event);
1992 rec->bytes_written += err;
1995 err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1996 process_synthesized_event, machine);
2000 /* Synthesize id_index before auxtrace_info */
2001 err = perf_event__synthesize_id_index(tool,
2002 process_synthesized_event,
2003 session->evlist, machine);
2007 if (rec->opts.full_auxtrace) {
2008 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2009 session, process_synthesized_event);
2014 if (!evlist__exclude_kernel(rec->evlist)) {
2015 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2017 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2018 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2019 "Check /proc/kallsyms permission or run as root.\n");
2021 err = perf_event__synthesize_modules(tool, process_synthesized_event,
2023 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2024 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2025 "Check /proc/modules permission or run as root.\n");
2029 machines__process_guests(&session->machines,
2030 perf_event__synthesize_guest_os, tool);
2033 err = perf_event__synthesize_extra_attr(&rec->tool,
2035 process_synthesized_event,
2040 err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2041 process_synthesized_event,
2044 pr_err("Couldn't synthesize thread map.\n");
2048 err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2049 process_synthesized_event, NULL);
2051 pr_err("Couldn't synthesize cpu map.\n");
2055 err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2058 pr_warning("Couldn't synthesize bpf events.\n");
2062 if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2063 err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2066 pr_warning("Couldn't synthesize cgroup events.\n");
2071 if (rec->opts.nr_threads_synthesize > 1) {
2072 mutex_init(&synth_lock);
2073 perf_set_multithreaded();
2074 f = process_locked_synthesized_event;
2077 if (rec->opts.synth & PERF_SYNTH_TASK) {
2078 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2080 err = __machine__synthesize_threads(machine, tool, &opts->target,
2081 rec->evlist->core.threads,
2082 f, needs_mmap, opts->sample_address,
2083 rec->opts.nr_threads_synthesize);
2086 if (rec->opts.nr_threads_synthesize > 1) {
2087 perf_set_singlethreaded();
2088 mutex_destroy(&synth_lock);
2095 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2097 struct record *rec = data;
2098 pthread_kill(rec->thread_id, SIGUSR2);
2102 static int record__setup_sb_evlist(struct record *rec)
2104 struct record_opts *opts = &rec->opts;
2106 if (rec->sb_evlist != NULL) {
2108 * We get here if --switch-output-event populated the
2109 * sb_evlist, so associate a callback that will send a SIGUSR2
2110 * to the main thread.
2112 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2113 rec->thread_id = pthread_self();
2115 #ifdef HAVE_LIBBPF_SUPPORT
2116 if (!opts->no_bpf_event) {
2117 if (rec->sb_evlist == NULL) {
2118 rec->sb_evlist = evlist__new();
2120 if (rec->sb_evlist == NULL) {
2121 pr_err("Couldn't create side band evlist.\n.");
2126 if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2127 pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2132 if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2133 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2134 opts->no_bpf_event = true;
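/*
 * Store a pair of reference timestamps (gettimeofday() and the selected
 * clockid) in the header so that reports can later convert perf timestamps
 * to wall-clock time.
 */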
2140 static int record__init_clock(struct record *rec)
2142 struct perf_session *session = rec->session;
2143 struct timespec ref_clockid;
2144 struct timeval ref_tod;
2147 if (!rec->opts.use_clockid)
2150 if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2151 session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2153 session->header.env.clock.clockid = rec->opts.clockid;
2155 if (gettimeofday(&ref_tod, NULL) != 0) {
2156 pr_err("gettimeofday failed, cannot set reference time.\n");
2160 if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2161 pr_err("clock_gettime failed, cannot set reference time.\n");
2165 ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2166 (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2168 session->header.env.clock.tod_ns = ref;
2170 ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2171 (u64) ref_clockid.tv_nsec;
2173 session->header.env.clock.clockid_ns = ref;
2177 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2179 if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2180 trigger_hit(&auxtrace_snapshot_trigger);
2181 auxtrace_record__snapshot_started = 1;
2182 if (auxtrace_record__snapshot_start(rec->itr))
2183 trigger_error(&auxtrace_snapshot_trigger);
2187 static void record__uniquify_name(struct record *rec)
2190 struct evlist *evlist = rec->evlist;
2194 if (perf_pmus__num_core_pmus() == 1)
2197 evlist__for_each_entry(evlist, pos) {
2198 if (!evsel__is_hybrid(pos))
2201 if (strchr(pos->name, '/'))
2204 ret = asprintf(&new_name, "%s/%s/",
2205 pos->pmu_name, pos->name);
2208 pos->name = new_name;
2213 static int record__terminate_thread(struct record_thread *thread_data)
2216 enum thread_msg ack = THREAD_MSG__UNDEFINED;
2217 pid_t tid = thread_data->tid;
2219 close(thread_data->pipes.msg[1]);
2220 thread_data->pipes.msg[1] = -1;
2221 err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2223 pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2225 pr_warning("threads[%d]: failed to receive termination notification from %d\n",
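/*
 * Start the nr_threads - 1 detached reader threads with their affinity masks
 * applied, wait for each one's ready message on its ack pipe, and finally
 * pin the main thread (thread 0) to its own mask.
 */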
2231 static int record__start_threads(struct record *rec)
2233 int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2234 struct record_thread *thread_data = rec->thread_data;
2235 sigset_t full, mask;
2237 pthread_attr_t attrs;
2239 thread = &thread_data[0];
2241 if (!record__threads_enabled(rec))
2245 if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2246 pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2250 pthread_attr_init(&attrs);
2251 pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2253 for (t = 1; t < nr_threads; t++) {
2254 enum thread_msg msg = THREAD_MSG__UNDEFINED;
2256 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2257 pthread_attr_setaffinity_np(&attrs,
2258 MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2259 (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2261 if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2262 for (tt = 1; tt < t; tt++)
2263 record__terminate_thread(&thread_data[t]);
2264 pr_err("Failed to start threads: %s\n", strerror(errno));
2269 err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2271 pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2272 thread_msg_tags[msg]);
2274 pr_warning("threads[%d]: failed to receive start notification from %d\n",
2275 thread->tid, rec->thread_data[t].tid);
2278 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2279 (cpu_set_t *)thread->mask->affinity.bits);
2281 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2284 pthread_attr_destroy(&attrs);
2286 if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2287 pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2294 static int record__stop_threads(struct record *rec)
2297 struct record_thread *thread_data = rec->thread_data;
2299 for (t = 1; t < rec->nr_threads; t++)
2300 record__terminate_thread(&thread_data[t]);
2302 for (t = 0; t < rec->nr_threads; t++) {
2303 rec->samples += thread_data[t].samples;
2304 if (!record__threads_enabled(rec))
2306 rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2307 rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2308 pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2309 thread_data[t].samples, thread_data[t].waking);
2310 if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2311 pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2312 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2314 pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2320 static unsigned long record__waking(struct record *rec)
2323 unsigned long waking = 0;
2324 struct record_thread *thread_data = rec->thread_data;
2326 for (t = 0; t < rec->nr_threads; t++)
2327 waking += thread_data[t].waking;
2332 static int __cmd_record(struct record *rec, int argc, const char **argv)
2336 const bool forks = argc > 0;
2337 struct perf_tool *tool = &rec->tool;
2338 struct record_opts *opts = &rec->opts;
2339 struct perf_data *data = &rec->data;
2340 struct perf_session *session;
2341 bool disabled = false, draining = false;
2344 enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2346 atexit(record__sig_exit);
2347 signal(SIGCHLD, sig_handler);
2348 signal(SIGINT, sig_handler);
2349 signal(SIGTERM, sig_handler);
2350 signal(SIGSEGV, sigsegv_handler);
2352 if (rec->opts.record_namespaces)
2353 tool->namespace_events = true;
2355 if (rec->opts.record_cgroup) {
2356 #ifdef HAVE_FILE_HANDLE
2357 tool->cgroup_events = true;
2359 pr_err("cgroup tracking is not supported\n");
2364 if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2365 signal(SIGUSR2, snapshot_sig_handler);
2366 if (rec->opts.auxtrace_snapshot_mode)
2367 trigger_on(&auxtrace_snapshot_trigger);
2368 if (rec->switch_output.enabled)
2369 trigger_on(&switch_output_trigger);
2371 signal(SIGUSR2, SIG_IGN);
2374 session = perf_session__new(data, tool);
2375 if (IS_ERR(session)) {
2376 pr_err("Perf session creation failed.\n");
2377 return PTR_ERR(session);
2380 if (record__threads_enabled(rec)) {
2381 if (perf_data__is_pipe(&rec->data)) {
2382 pr_err("Parallel trace streaming is not available in pipe mode.\n");
2385 if (rec->opts.full_auxtrace) {
2386 pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2391 fd = perf_data__fd(data);
2392 rec->session = session;
2394 if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2395 pr_err("Compression initialization failed.\n");
2398 #ifdef HAVE_EVENTFD_SUPPORT
2399 done_fd = eventfd(0, EFD_NONBLOCK);
2401 pr_err("Failed to create wakeup eventfd, error: %m\n");
2403 goto out_delete_session;
2405 err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2407 pr_err("Failed to add wakeup eventfd to poll list\n");
2409 goto out_delete_session;
2411 #endif // HAVE_EVENTFD_SUPPORT
2413 session->header.env.comp_type = PERF_COMP_ZSTD;
2414 session->header.env.comp_level = rec->opts.comp_level;
2416 if (rec->opts.kcore &&
2417 !record__kcore_readable(&session->machines.host)) {
2418 pr_err("ERROR: kcore is not readable.\n");
2422 if (record__init_clock(rec))
2425 record__init_features(rec);
2428 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2429 workload_exec_failed_signal);
2431 pr_err("Couldn't run the workload!\n");
2433 goto out_delete_session;
2438 * If we have just a single event and are sending data
2439 * through pipe, we need to force the ids allocation,
2440 * because we synthesize event name through the pipe
2441 * and need the id for that.
2443 if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2444 rec->opts.sample_id = true;
2446 record__uniquify_name(rec);
2448 /* Debug message used by test scripts */
2449 pr_debug3("perf record opening and mmapping events\n");
2450 if (record__open(rec) != 0) {
2452 goto out_free_threads;
2454 /* Debug message used by test scripts */
2455 pr_debug3("perf record done opening and mmapping events\n");
2456 session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2458 if (rec->opts.kcore) {
2459 err = record__kcore_copy(&session->machines.host, data);
2461 pr_err("ERROR: Failed to copy kcore\n");
2462 goto out_free_threads;
2467	 * Normally perf_session__new would do this, but it doesn't have the
	 * evlist.
2470 if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2471 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2472 rec->tool.ordered_events = false;
2475 if (evlist__nr_groups(rec->evlist) == 0)
2476 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2478 if (data->is_pipe) {
2479 err = perf_header__write_pipe(fd);
2481 goto out_free_threads;
2483 err = perf_session__write_header(session, rec->evlist, fd, false);
2485 goto out_free_threads;
2489 if (!rec->no_buildid
2490 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2491 pr_err("Couldn't generate buildids. "
2492 "Use --no-buildid to profile anyway.\n");
2493 goto out_free_threads;
2496 err = record__setup_sb_evlist(rec);
2498 goto out_free_threads;
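/*
 * Synthesize the non-sample events (kernel and module maps, existing
 * threads, CPU topology, ...) that are needed to resolve the samples
 * recorded later.
 */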
2500 err = record__synthesize(rec, false);
2502 goto out_free_threads;
2504 if (rec->realtime_prio) {
2505 struct sched_param param;
2507 param.sched_priority = rec->realtime_prio;
2508 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2509 pr_err("Could not set realtime priority.\n");
2511 goto out_free_threads;
2515 if (record__start_threads(rec))
2516 goto out_free_threads;
2519 * When perf is starting the traced process, all the events
2520 * (apart from group members) have enable_on_exec=1 set,
2521 * so don't spoil it by prematurely enabling them.
2523 if (!target__none(&opts->target) && !opts->target.initial_delay)
2524 evlist__enable(rec->evlist);
2530 struct machine *machine = &session->machines.host;
2531 union perf_event *event;
2534 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2535 if (event == NULL) {
2541	 * Some H/W events are generated before the COMM event,
2542	 * which is emitted during exec(), so perf script
2543	 * cannot see a correct process name for those events.
2544	 * Synthesize a COMM event to prevent that.
2546 tgid = perf_event__synthesize_comm(tool, event,
2547 rec->evlist->workload.pid,
2548 process_synthesized_event,
2555 event = malloc(sizeof(event->namespaces) +
2556 (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2557 machine->id_hdr_size);
2558 if (event == NULL) {
2564 * Synthesize NAMESPACES event for the command specified.
2566 perf_event__synthesize_namespaces(tool, event,
2567 rec->evlist->workload.pid,
2568 tgid, process_synthesized_event,
2572 evlist__start_workload(rec->evlist);
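/*
 * Handle --delay: events start disabled; a positive delay sleeps and
 * then enables them, while -1 leaves them disabled until they are
 * enabled through the control interface.
 */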
2575 if (opts->target.initial_delay) {
2576 pr_info(EVLIST_DISABLED_MSG);
2577 if (opts->target.initial_delay > 0) {
2578 usleep(opts->target.initial_delay * USEC_PER_MSEC);
2579 evlist__enable(rec->evlist);
2580 pr_info(EVLIST_ENABLED_MSG);
2584 err = event_enable_timer__start(rec->evlist->eet);
2588 /* Debug message used by test scripts */
2589 pr_debug3("perf record has started\n");
2592 trigger_ready(&auxtrace_snapshot_trigger);
2593 trigger_ready(&switch_output_trigger);
2594 perf_hooks__invoke_record_start();
2597 * Must write FINISHED_INIT so it will be seen after all other
2598 * synthesized user events, but before any regular events.
2600 err = write_finished_init(rec, false);
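/*
 * Main collection loop: keep flushing the mmap ring buffers to the
 * output until 'done' is set (by a signal, the workload exiting or a
 * control command) and no new samples arrived in the last pass.
 */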
2605 unsigned long long hits = thread->samples;
2608	 * rec->evlist->bkw_mmap_state can be
2609	 * BKW_MMAP_EMPTY here: when done == true and
2610	 * hits != rec->samples in the previous round.
2612	 * evlist__toggle_bkw_mmap ensures we never
2613	 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2615 if (trigger_is_hit(&switch_output_trigger) || done || draining)
2616 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2618 if (record__mmap_read_all(rec, false) < 0) {
2619 trigger_error(&auxtrace_snapshot_trigger);
2620 trigger_error(&switch_output_trigger);
2625 if (auxtrace_record__snapshot_started) {
2626 auxtrace_record__snapshot_started = 0;
2627 if (!trigger_is_error(&auxtrace_snapshot_trigger))
2628 record__read_auxtrace_snapshot(rec, false);
2629 if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2630 pr_err("AUX area tracing snapshot failed\n");
2636 if (trigger_is_hit(&switch_output_trigger)) {
2638	 * If switch_output_trigger is hit, the data in the
2639	 * overwritable ring buffer should have been collected,
2640	 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2642	 * If SIGUSR2 was raised after or during record__mmap_read_all(),
2643	 * record__mmap_read_all() didn't collect data from the
2644	 * overwritable ring buffer. Read again.
2646 if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2648 trigger_ready(&switch_output_trigger);
2651 * Reenable events in overwrite ring buffer after
2652	 * record__mmap_read_all(): we should have collected
	 * data from it.
2655 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2658 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2659 record__waking(rec));
2661 fd = record__switch_output(rec, false);
2663 pr_err("Failed to switch to new file\n");
2664 trigger_error(&switch_output_trigger);
2669 /* re-arm the alarm */
2670 if (rec->switch_output.time)
2671 alarm(rec->switch_output.time);
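/*
 * Nothing new was read in this pass: either leave the loop
 * (done/draining) or block in poll() until the kernel signals that
 * more data is available.
 */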
2674 if (hits == thread->samples) {
2675 if (done || draining)
2677 err = fdarray__poll(&thread->pollfd, -1);
2679	 * Propagate the error only if there is one. Ignore a positive
2680	 * number of returned events and an interrupted poll (EINTR).
2682 if (err > 0 || (err < 0 && errno == EINTR))
2686 if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2687 record__thread_munmap_filtered, NULL) == 0)
2690 err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
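/*
 * Process any pending command on the --control descriptor:
 * snapshot, stop, enable/disable, etc.
 */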
2695 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2697 case EVLIST_CTL_CMD_SNAPSHOT:
2698 hit_auxtrace_snapshot_trigger(rec);
2699 evlist__ctlfd_ack(rec->evlist);
2701 case EVLIST_CTL_CMD_STOP:
2704 case EVLIST_CTL_CMD_ACK:
2705 case EVLIST_CTL_CMD_UNSUPPORTED:
2706 case EVLIST_CTL_CMD_ENABLE:
2707 case EVLIST_CTL_CMD_DISABLE:
2708 case EVLIST_CTL_CMD_EVLIST:
2709 case EVLIST_CTL_CMD_PING:
2715 err = event_enable_timer__process(rec->evlist->eet);
2724	 * When perf is starting the traced process, the events die
2725	 * with the process at the end and we wait for that. Thus there is
2726	 * no need to disable events in this case.
2728 if (done && !disabled && !target__none(&opts->target)) {
2729 trigger_off(&auxtrace_snapshot_trigger);
2730 evlist__disable(rec->evlist);
2735 trigger_off(&auxtrace_snapshot_trigger);
2736 trigger_off(&switch_output_trigger);
2738 if (opts->auxtrace_snapshot_on_exit)
2739 record__auxtrace_snapshot_exit(rec);
2741 if (forks && workload_exec_errno) {
2742 char msg[STRERR_BUFSIZE], strevsels[2048];
2743 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2745 evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2747 pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2748 strevsels, argv[0], emsg);
2754 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2755 record__waking(rec));
2757 write_finished_init(rec, true);
2759 if (target__none(&rec->opts.target))
2760 record__synthesize_workload(rec, true);
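/*
 * Final drain: stop the reader threads, flush whatever is left in
 * the ring buffers and release the per-thread resources.
 */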
2763 record__stop_threads(rec);
2764 record__mmap_read_all(rec, true);
2766 record__free_thread_data(rec);
2767 evlist__finalize_ctlfd(rec->evlist);
2768 record__aio_mmap_read_sync(rec);
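/*
 * Store the achieved compression ratio in the header environment so
 * that it can be reported when reading the file back.
 */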
2770 if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2771 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2772 session->header.env.comp_ratio = ratio + 0.5;
2778 if (!child_finished)
2779 kill(rec->evlist->workload.pid, SIGTERM);
2785 else if (WIFEXITED(exit_status))
2786 status = WEXITSTATUS(exit_status);
2787 else if (WIFSIGNALED(exit_status))
2788 signr = WTERMSIG(exit_status);
2793 rec->bytes_written += off_cpu_write(rec->session);
2795 record__read_lost_samples(rec);
2796 record__synthesize(rec, true);
2797 /* this will be recalculated during process_buildids() */
2801 if (!rec->timestamp_filename) {
2802 record__finish_output(rec);
2804 fd = record__switch_output(rec, true);
2807 goto out_delete_session;
2812 perf_hooks__invoke_record_end();
2814 if (!err && !quiet) {
2816 const char *postfix = rec->timestamp_filename ?
2817 ".<timestamp>" : "";
2819 if (rec->samples && !rec->opts.full_auxtrace)
2820 scnprintf(samples, sizeof(samples),
2821 " (%" PRIu64 " samples)", rec->samples);
2825 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
2826 perf_data__size(data) / 1024.0 / 1024.0,
2827 data->path, postfix, samples);
2829 fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2830 rec->session->bytes_transferred / 1024.0 / 1024.0,
2833 fprintf(stderr, " ]\n");
2837 #ifdef HAVE_EVENTFD_SUPPORT
2845 zstd_fini(&session->zstd_data);
2846 perf_session__delete(session);
2848 if (!opts->no_bpf_event)
2849 evlist__stop_sb_thread(rec->sb_evlist);
2853 static void callchain_debug(struct callchain_param *callchain)
2855 static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2857 pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2859 if (callchain->record_mode == CALLCHAIN_DWARF)
2860 pr_debug("callchain: stack dump size %d\n",
2861 callchain->dump_size);
2864 int record_opts__parse_callchain(struct record_opts *record,
2865 struct callchain_param *callchain,
2866 const char *arg, bool unset)
2869 callchain->enabled = !unset;
2871 /* --no-call-graph */
2873 callchain->record_mode = CALLCHAIN_NONE;
2874 pr_debug("callchain: disabled\n");
2878 ret = parse_callchain_record_opt(arg, callchain);
2880 /* Enable data address sampling for DWARF unwind. */
2881 if (callchain->record_mode == CALLCHAIN_DWARF)
2882 record->sample_address = true;
2883 callchain_debug(callchain);
2889 int record_parse_callchain_opt(const struct option *opt,
2893 return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2896 int record_callchain_opt(const struct option *opt,
2897 const char *arg __maybe_unused,
2898 int unset __maybe_unused)
2900 struct callchain_param *callchain = opt->value;
2902 callchain->enabled = true;
2904 if (callchain->record_mode == CALLCHAIN_NONE)
2905 callchain->record_mode = CALLCHAIN_FP;
2907 callchain_debug(callchain);
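/*
 * Apply 'record.*' settings from perfconfig: build-id handling,
 * call-graph mode, AIO block count and debuginfod URLs.
 */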
2911 static int perf_record_config(const char *var, const char *value, void *cb)
2913 struct record *rec = cb;
2915 if (!strcmp(var, "record.build-id")) {
2916 if (!strcmp(value, "cache"))
2917 rec->no_buildid_cache = false;
2918 else if (!strcmp(value, "no-cache"))
2919 rec->no_buildid_cache = true;
2920 else if (!strcmp(value, "skip"))
2921 rec->no_buildid = true;
2922 else if (!strcmp(value, "mmap"))
2923 rec->buildid_mmap = true;
2928 if (!strcmp(var, "record.call-graph")) {
2929 var = "call-graph.record-mode";
2930 return perf_default_config(var, value, cb);
2932 #ifdef HAVE_AIO_SUPPORT
2933 if (!strcmp(var, "record.aio")) {
2934 rec->opts.nr_cblocks = strtol(value, NULL, 0);
2935 if (!rec->opts.nr_cblocks)
2936 rec->opts.nr_cblocks = nr_cblocks_default;
2939 if (!strcmp(var, "record.debuginfod")) {
2940 rec->debuginfod.urls = strdup(value);
2941 if (!rec->debuginfod.urls)
2943 rec->debuginfod.set = true;
2949 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2951 struct record *rec = (struct record *)opt->value;
2953 return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2956 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2958 struct record_opts *opts = (struct record_opts *)opt->value;
2963 if (!strcasecmp(str, "node"))
2964 opts->affinity = PERF_AFFINITY_NODE;
2965 else if (!strcasecmp(str, "cpu"))
2966 opts->affinity = PERF_AFFINITY_CPU;
2971 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2973 mask->nbits = nr_bits;
2974 mask->bits = bitmap_zalloc(mask->nbits);
2981 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2983 bitmap_free(mask->bits);
2987 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2991 ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2993 mask->affinity.bits = NULL;
2997 ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
2999 record__mmap_cpu_mask_free(&mask->maps);
3000 mask->maps.bits = NULL;
3006 static void record__thread_mask_free(struct thread_mask *mask)
3008 record__mmap_cpu_mask_free(&mask->maps);
3009 record__mmap_cpu_mask_free(&mask->affinity);
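/*
 * Parse the --threads argument: no value defaults to one thread per
 * CPU, a known tag (cpu/core/package/numa) selects that grouping and
 * anything else is kept as a user provided mask specification.
 */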
3012 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3015 struct record_opts *opts = opt->value;
3017 if (unset || !str || !strlen(str)) {
3018 opts->threads_spec = THREAD_SPEC__CPU;
3020 for (s = 1; s < THREAD_SPEC__MAX; s++) {
3021 if (s == THREAD_SPEC__USER) {
3022 opts->threads_user_spec = strdup(str);
3023 if (!opts->threads_user_spec)
3025 opts->threads_spec = THREAD_SPEC__USER;
3028 if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3029 opts->threads_spec = s;
3035 if (opts->threads_spec == THREAD_SPEC__USER)
3036 pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3038 pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3043 static int parse_output_max_size(const struct option *opt,
3044 const char *str, int unset)
3046 unsigned long *s = (unsigned long *)opt->value;
3047 static struct parse_tag tags_size[] = {
3048 { .tag = 'B', .mult = 1 },
3049 { .tag = 'K', .mult = 1 << 10 },
3050 { .tag = 'M', .mult = 1 << 20 },
3051 { .tag = 'G', .mult = 1 << 30 },
3061 val = parse_tag_value(str, tags_size);
3062 if (val != (unsigned long) -1) {
3070 static int record__parse_mmap_pages(const struct option *opt,
3072 int unset __maybe_unused)
3074 struct record_opts *opts = opt->value;
3076 unsigned int mmap_pages;
3091 ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3094 opts->mmap_pages = mmap_pages;
3102 ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3106 opts->auxtrace_mmap_pages = mmap_pages;
3113 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3117 static int parse_control_option(const struct option *opt,
3119 int unset __maybe_unused)
3121 struct record_opts *opts = opt->value;
3123 return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3126 static void switch_output_size_warn(struct record *rec)
3128 u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3129 struct switch_output *s = &rec->switch_output;
3133 if (s->size < wakeup_size) {
3136 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3137 pr_warning("WARNING: switch-output data size lower than "
3138 "wakeup kernel buffer size (%s) "
3139 "expect bigger perf.data sizes\n", buf);
3143 static int switch_output_setup(struct record *rec)
3145 struct switch_output *s = &rec->switch_output;
3146 static struct parse_tag tags_size[] = {
3147 { .tag = 'B', .mult = 1 },
3148 { .tag = 'K', .mult = 1 << 10 },
3149 { .tag = 'M', .mult = 1 << 20 },
3150 { .tag = 'G', .mult = 1 << 30 },
3153 static struct parse_tag tags_time[] = {
3154 { .tag = 's', .mult = 1 },
3155 { .tag = 'm', .mult = 60 },
3156 { .tag = 'h', .mult = 60*60 },
3157 { .tag = 'd', .mult = 60*60*24 },
3163	 * If we're using --switch-output-events, then we imply
3164	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3165	 * thread to its parent.
3167 if (rec->switch_output_event_set) {
3168 if (record__threads_enabled(rec)) {
3169 pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3178 if (record__threads_enabled(rec)) {
3179 pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3183 if (!strcmp(s->str, "signal")) {
3186 pr_debug("switch-output with SIGUSR2 signal\n");
3190 val = parse_tag_value(s->str, tags_size);
3191 if (val != (unsigned long) -1) {
3193 pr_debug("switch-output with %s size threshold\n", s->str);
3197 val = parse_tag_value(s->str, tags_time);
3198 if (val != (unsigned long) -1) {
3200 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3208 rec->timestamp_filename = true;
3211 if (s->size && !rec->opts.no_buffering)
3212 switch_output_size_warn(rec);
3217 static const char * const __record_usage[] = {
3218 "perf record [<options>] [<command>]",
3219 "perf record [<options>] -- <command> [<options>]",
3222 const char * const *record_usage = __record_usage;
3224 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3225 struct perf_sample *sample, struct machine *machine)
3228	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3229	 * so there is no need to add them twice.
3231 if (!(event->header.misc & PERF_RECORD_MISC_USER))
3233 return perf_event__process_mmap(tool, event, sample, machine);
3236 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3237 struct perf_sample *sample, struct machine *machine)
3240	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3241	 * so there is no need to add them twice.
3243 if (!(event->header.misc & PERF_RECORD_MISC_USER))
3246 return perf_event__process_mmap2(tool, event, sample, machine);
3249 static int process_timestamp_boundary(struct perf_tool *tool,
3250 union perf_event *event __maybe_unused,
3251 struct perf_sample *sample,
3252 struct machine *machine __maybe_unused)
3254 struct record *rec = container_of(tool, struct record, tool);
3256 set_timestamp_boundary(rec, sample->time);
3260 static int parse_record_synth_option(const struct option *opt,
3262 int unset __maybe_unused)
3264 struct record_opts *opts = opt->value;
3265 char *p = strdup(str);
3270 opts->synth = parse_synth_opt(p);
3273 if (opts->synth < 0) {
3274 pr_err("Invalid synth option: %s\n", str);
3281 * XXX Ideally would be local to cmd_record() and passed to a record__new
3282 * because we need to have access to it in record__exit, that is called
3283 * after cmd_record() exits, but since record_options need to be accessible to
3284 * builtin-script, leave it here.
3286 * At least we don't touch it in all the other functions here directly.
3288 * Just say no to tons of global variables, sigh.
3290 static struct record record = {
3292 .sample_time = true,
3293 .mmap_pages = UINT_MAX,
3294 .user_freq = UINT_MAX,
3295 .user_interval = ULLONG_MAX,
3299 .default_per_cpu = true,
3301 .mmap_flush = MMAP_FLUSH_DEFAULT,
3302 .nr_threads_synthesize = 1,
3305 .synth = PERF_SYNTH_ALL,
3308 .sample = process_sample_event,
3309 .fork = perf_event__process_fork,
3310 .exit = perf_event__process_exit,
3311 .comm = perf_event__process_comm,
3312 .namespaces = perf_event__process_namespaces,
3313 .mmap = build_id__process_mmap,
3314 .mmap2 = build_id__process_mmap2,
3315 .itrace_start = process_timestamp_boundary,
3316 .aux = process_timestamp_boundary,
3317 .ordered_events = true,
3321 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3322 "\n\t\t\t\tDefault: fp";
3324 static bool dry_run;
3326 static struct parse_events_option_args parse_events_option_args = {
3327 .evlistp = &record.evlist,
3330 static struct parse_events_option_args switch_output_parse_events_option_args = {
3331 .evlistp = &record.sb_evlist,
3335 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3336 * with it and switch to using the library functions in perf_evlist that came
3337 * from builtin-record.c, i.e. use record_opts,
3338 * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
3341 static struct option __record_options[] = {
3342 OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3343 "event selector. use 'perf list' to list available events",
3344 parse_events_option),
3345 OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3346 "event filter", parse_filter),
3347 OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3348 NULL, "don't record events from perf itself",
3350 OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3351 "record events on existing process id"),
3352 OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3353 "record events on existing thread id"),
3354 OPT_INTEGER('r', "realtime", &record.realtime_prio,
3355 "collect data with this RT SCHED_FIFO priority"),
3356 OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3357 "collect data without buffering"),
3358 OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3359 "collect raw sample records from all opened counters"),
3360 OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3361 "system-wide collection from all CPUs"),
3362 OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3363 "list of cpus to monitor"),
3364 OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3365 OPT_STRING('o', "output", &record.data.path, "file",
3366 "output file name"),
3367 OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3368 &record.opts.no_inherit_set,
3369 "child tasks do not inherit counters"),
3370 OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3371 "synthesize non-sample events at the end of output"),
3372 OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3373 OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3374 OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3375 "Fail if the specified frequency can't be used"),
3376 OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3377 "profile at this frequency",
3378 record__parse_freq),
3379 OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3380 "number of mmap data pages and AUX area tracing mmap pages",
3381 record__parse_mmap_pages),
3382 OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3383 "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3384 record__mmap_flush_parse),
3385 OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3386 NULL, "enables call-graph recording" ,
3387 &record_callchain_opt),
3388 OPT_CALLBACK(0, "call-graph", &record.opts,
3389 "record_mode[,record_size]", record_callchain_help,
3390 &record_parse_callchain_opt),
3391 OPT_INCR('v', "verbose", &verbose,
3392 "be more verbose (show counter open errors, etc)"),
3393 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3394 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3395 "per thread counts"),
3396 OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3397 OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3398 "Record the sample physical addresses"),
3399 OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3400 "Record the sampled data address data page size"),
3401 OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3402 "Record the sampled code address (ip) page size"),
3403 OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3404 OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3405 "Record the sample identifier"),
3406 OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3407 &record.opts.sample_time_set,
3408 "Record the sample timestamps"),
3409 OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3410 "Record the sample period"),
3411 OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3413 OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3414 &record.no_buildid_cache_set,
3415 "do not update the buildid cache"),
3416 OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3417 &record.no_buildid_set,
3418 "do not collect buildids in perf.data"),
3419 OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3420 "monitor event in cgroup name only",
3422 OPT_CALLBACK('D', "delay", &record, "ms",
3423 "ms to wait before starting measurement after program start (-1: start with events disabled), "
3424 "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3425 record__parse_event_enable_time),
3426 OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3427 OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3430 OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3431 "branch any", "sample any taken branches",
3432 parse_branch_stack),
3434 OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3435 "branch filter mask", "branch stack filter modes",
3436 parse_branch_stack),
3437 OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3438 "sample by weight (on special events only)"),
3439 OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3440 "sample transaction flags (special events only)"),
3441 OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3442 "use per-thread mmaps"),
3443 OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3444 "sample selected machine registers on interrupt,"
3445 " use '-I?' to list register names", parse_intr_regs),
3446 OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3447 "sample selected machine registers on interrupt,"
3448 " use '--user-regs=?' to list register names", parse_user_regs),
3449 OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3450 "Record running/enabled time of read (:S) events"),
3451 OPT_CALLBACK('k', "clockid", &record.opts,
3452 "clockid", "clockid to use for events, see clock_gettime()",
3454 OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3455 "opts", "AUX area tracing Snapshot Mode", ""),
3456 OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3457 "opts", "sample AUX area", ""),
3458 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3459 "per thread proc mmap processing timeout in ms"),
3460 OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3461 "Record namespaces events"),
3462 OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3463 "Record cgroup events"),
3464 OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3465 &record.opts.record_switch_events_set,
3466 "Record context switch events"),
3467 OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3468 "Configure all used events to run in kernel space.",
3469 PARSE_OPT_EXCLUSIVE),
3470 OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3471 "Configure all used events to run in user space.",
3472 PARSE_OPT_EXCLUSIVE),
3473 OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3474 "collect kernel callchains"),
3475 OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3476 "collect user callchains"),
3477 OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3478 "file", "vmlinux pathname"),
3479 OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3480 "Record build-id of all DSOs regardless of hits"),
3481 OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3482 "Record build-id in map events"),
3483 OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3484 "append timestamp to output filename"),
3485 OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3486 "Record timestamp boundary (time of first/last samples)"),
3487 OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3488 &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3489 "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3491 OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3492 &record.switch_output_event_set, "switch output event",
3493 "switch output event selector. use 'perf list' to list available events",
3494 parse_events_option_new_evlist),
3495 OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3496 "Limit number of switch output generated files"),
3497 OPT_BOOLEAN(0, "dry-run", &dry_run,
3498 "Parse options then exit"),
3499 #ifdef HAVE_AIO_SUPPORT
3500 OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3501 &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3504 OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3505 "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3506 record__parse_affinity),
3507 #ifdef HAVE_ZSTD_SUPPORT
3508 OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3509 "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3510 record__parse_comp_level),
3512 OPT_CALLBACK(0, "max-size", &record.output_max_size,
3513 "size", "Limit the maximum size of the output file", parse_output_max_size),
3514 OPT_UINTEGER(0, "num-thread-synthesize",
3515 &record.opts.nr_threads_synthesize,
3516 "number of threads to run for event synthesis"),
3518 OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3519 "libpfm4 event selector. use 'perf list' to list available events",
3520 parse_libpfm_events_option),
3522 OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3523 "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3524 "\t\t\t 'snapshot': AUX area tracing snapshot).\n"
3525 "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3526 "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3527 parse_control_option),
3528 OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3529 "Fine-tune event synthesis: default=all", parse_record_synth_option),
3530 OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3531 &record.debuginfod.set, "debuginfod urls",
3532 "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3534 OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3535 "write collected trace data into several data files using parallel threads",
3536 record__parse_threads),
3537 OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3541 struct option *record_options = __record_options;
3543 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3545 struct perf_cpu cpu;
3548 if (cpu_map__is_dummy(cpus))
3551 perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3554 /* Return ENODEV if the input cpu is greater than max cpu */
3555 if ((unsigned long)cpu.cpu > mask->nbits)
3557 __set_bit(cpu.cpu, mask->bits);
3563 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3565 struct perf_cpu_map *cpus;
3567 cpus = perf_cpu_map__new(mask_spec);
3571 bitmap_zero(mask->bits, mask->nbits);
3572 if (record__mmap_cpu_mask_init(mask, cpus))
3575 perf_cpu_map__put(cpus);
3580 static void record__free_thread_masks(struct record *rec, int nr_threads)
3584 if (rec->thread_masks)
3585 for (t = 0; t < nr_threads; t++)
3586 record__thread_mask_free(&rec->thread_masks[t]);
3588 zfree(&rec->thread_masks);
3591 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3595 rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3596 if (!rec->thread_masks) {
3597 pr_err("Failed to allocate thread masks\n");
3601 for (t = 0; t < nr_threads; t++) {
3602 ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3604 pr_err("Failed to allocate thread masks[%d]\n", t);
3612 record__free_thread_masks(rec, nr_threads);
3617 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3619 int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3621 ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3625 rec->nr_threads = nr_cpus;
3626 pr_debug("nr_threads: %d\n", rec->nr_threads);
3628 for (t = 0; t < rec->nr_threads; t++) {
3629 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3630 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3632 pr_debug("thread_masks[%d]: ", t);
3633 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3634 pr_debug("thread_masks[%d]: ", t);
3635 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
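/*
 * Build one thread mask per spec entry: each maps/affinity pair is
 * intersected with the recorded CPU map, must not end up empty and
 * must not overlap the masks accepted so far.
 */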
3642 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3643 const char **maps_spec, const char **affinity_spec,
3648 struct mmap_cpu_mask cpus_mask;
3649 struct thread_mask thread_mask, full_mask, *thread_masks;
3651 ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3653 pr_err("Failed to allocate CPUs mask\n");
3657 ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3659 pr_err("Failed to init cpu mask\n");
3660 goto out_free_cpu_mask;
3663 ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3665 pr_err("Failed to allocate full mask\n");
3666 goto out_free_cpu_mask;
3669 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3671 pr_err("Failed to allocate thread mask\n");
3672 goto out_free_full_and_cpu_masks;
3675 for (s = 0; s < nr_spec; s++) {
3676 ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3678 pr_err("Failed to initialize maps thread mask\n");
3681 ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3683 pr_err("Failed to initialize affinity thread mask\n");
3687 /* ignore invalid CPUs but do not allow empty masks */
3688 if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3689 cpus_mask.bits, thread_mask.maps.nbits)) {
3690 pr_err("Empty maps mask: %s\n", maps_spec[s]);
3694 if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3695 cpus_mask.bits, thread_mask.affinity.nbits)) {
3696 pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3701 /* do not allow intersection with other masks (full_mask) */
3702 if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3703 thread_mask.maps.nbits)) {
3704 pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3708 if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3709 thread_mask.affinity.nbits)) {
3710 pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3715 bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3716 thread_mask.maps.bits, full_mask.maps.nbits);
3717 bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3718 thread_mask.affinity.bits, full_mask.maps.nbits);
3720 thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3721 if (!thread_masks) {
3722 pr_err("Failed to reallocate thread masks\n");
3726 rec->thread_masks = thread_masks;
3727 rec->thread_masks[t] = thread_mask;
3729 pr_debug("thread_masks[%d]: ", t);
3730 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3731 pr_debug("thread_masks[%d]: ", t);
3732 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3735 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3737 pr_err("Failed to allocate thread mask\n");
3738 goto out_free_full_and_cpu_masks;
3741 rec->nr_threads = t;
3742 pr_debug("nr_threads: %d\n", rec->nr_threads);
3743 if (!rec->nr_threads)
3747 record__thread_mask_free(&thread_mask);
3748 out_free_full_and_cpu_masks:
3749 record__thread_mask_free(&full_mask);
3751 record__mmap_cpu_mask_free(&cpus_mask);
3756 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3759 struct cpu_topology *topo;
3761 topo = cpu_topology__new();
3763 pr_err("Failed to allocate CPU topology\n");
3767 ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3768 topo->core_cpus_list, topo->core_cpus_lists);
3769 cpu_topology__delete(topo);
3774 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3777 struct cpu_topology *topo;
3779 topo = cpu_topology__new();
3781 pr_err("Failed to allocate CPU topology\n");
3785 ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3786 topo->package_cpus_list, topo->package_cpus_lists);
3787 cpu_topology__delete(topo);
3792 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3797 struct numa_topology *topo;
3799 topo = numa_topology__new();
3801 pr_err("Failed to allocate NUMA topology\n");
3805 spec = zalloc(topo->nr * sizeof(char *));
3807 pr_err("Failed to allocate NUMA spec\n");
3809 goto out_delete_topo;
3811 for (s = 0; s < topo->nr; s++)
3812 spec[s] = topo->nodes[s].cpus;
3814 ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3819 numa_topology__delete(topo);
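/*
 * A user spec is a ':' separated list of <maps>/<affinity> CPU mask
 * pairs; split it up and hand the collected lists over to
 * record__init_thread_masks_spec().
 */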
3824 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3828 char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3829 char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3831 for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3832 spec = strtok_r(user_spec, ":", &spec_ptr);
3835 pr_debug2("threads_spec[%d]: %s\n", t, spec);
3836 mask = strtok_r(spec, "/", &mask_ptr);
3839 pr_debug2(" maps mask: %s\n", mask);
3840 tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3842 pr_err("Failed to reallocate maps spec\n");
3846 maps_spec = tmp_spec;
3847 maps_spec[nr_spec] = dup_mask = strdup(mask);
3848 if (!maps_spec[nr_spec]) {
3849 pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3853 mask = strtok_r(NULL, "/", &mask_ptr);
3855 pr_err("Invalid thread maps or affinity specs\n");
3859 pr_debug2(" affinity mask: %s\n", mask);
3860 tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3862 pr_err("Failed to reallocate affinity spec\n");
3866 affinity_spec = tmp_spec;
3867 affinity_spec[nr_spec] = strdup(mask);
3868 if (!affinity_spec[nr_spec]) {
3869 pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3877 ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3878 (const char **)affinity_spec, nr_spec);
3882 for (s = 0; s < nr_spec; s++) {
3886 free(affinity_spec[s]);
3888 free(affinity_spec);
3894 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3898 ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3902 if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3905 rec->nr_threads = 1;
3910 static int record__init_thread_masks(struct record *rec)
3913 struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3915 if (!record__threads_enabled(rec))
3916 return record__init_thread_default_masks(rec, cpus);
3918 if (evlist__per_thread(rec->evlist)) {
3919 pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3923 switch (rec->opts.threads_spec) {
3924 case THREAD_SPEC__CPU:
3925 ret = record__init_thread_cpu_masks(rec, cpus);
3927 case THREAD_SPEC__CORE:
3928 ret = record__init_thread_core_masks(rec, cpus);
3930 case THREAD_SPEC__PACKAGE:
3931 ret = record__init_thread_package_masks(rec, cpus);
3933 case THREAD_SPEC__NUMA:
3934 ret = record__init_thread_numa_masks(rec, cpus);
3936 case THREAD_SPEC__USER:
3937 ret = record__init_thread_user_masks(rec, cpus);
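/*
 * Entry point for 'perf record': parse the options, validate the
 * target, configure the evlist and then hand control over to
 * __cmd_record().
 */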
3946 int cmd_record(int argc, const char **argv)
3949 struct record *rec = &record;
3950 char errbuf[BUFSIZ];
3952 setlocale(LC_ALL, "");
3954 #ifndef HAVE_BPF_SKEL
3955 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3956 set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3960 rec->opts.affinity = PERF_AFFINITY_SYS;
3962 rec->evlist = evlist__new();
3963 if (rec->evlist == NULL)
3966 err = perf_config(perf_record_config, rec);
3970 argc = parse_options(argc, argv, record_options, record_usage,
3971 PARSE_OPT_STOP_AT_NON_OPTION);
3973 perf_quiet_option();
3975 err = symbol__validate_sym_arguments();
3979 perf_debuginfod_setup(&record.debuginfod);
3981 /* Make system wide (-a) the default target. */
3982 if (!argc && target__none(&rec->opts.target))
3983 rec->opts.target.system_wide = true;
3985 if (nr_cgroups && !rec->opts.target.system_wide) {
3986 usage_with_options_msg(record_usage, record_options,
3987 "cgroup monitoring only available in system-wide mode");
3991 if (rec->buildid_mmap) {
3992 if (!perf_can_record_build_id()) {
3993 pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
3997 pr_debug("Enabling build id in mmap2 events.\n");
3998 /* Enable mmap build id synthesizing. */
3999 symbol_conf.buildid_mmap2 = true;
4000 /* Enable perf_event_attr::build_id bit. */
4001 rec->opts.build_id = true;
4002 /* Disable build id cache. */
4003 rec->no_buildid = true;
4006 if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4007 pr_err("Kernel has no cgroup sampling support.\n");
4012 if (rec->opts.kcore)
4013 rec->opts.text_poke = true;
4015 if (rec->opts.kcore || record__threads_enabled(rec))
4016 rec->data.is_dir = true;
4018 if (record__threads_enabled(rec)) {
4019 if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4020 pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4023 if (record__aio_enabled(rec)) {
4024 pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4029 if (rec->opts.comp_level != 0) {
4030 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4031 rec->no_buildid = true;
4034 if (rec->opts.record_switch_events &&
4035 !perf_can_record_switch_events()) {
4036 ui__error("kernel does not support recording context switch events\n");
4037 parse_options_usage(record_usage, record_options, "switch-events", 0);
4042 if (switch_output_setup(rec)) {
4043 parse_options_usage(record_usage, record_options, "switch-output", 0);
4048 if (rec->switch_output.time) {
4049 signal(SIGALRM, alarm_sig_handler);
4050 alarm(rec->switch_output.time);
4053 if (rec->switch_output.num_files) {
4054 rec->switch_output.filenames = calloc(sizeof(char *),
4055 rec->switch_output.num_files);
4056 if (!rec->switch_output.filenames) {
4062 if (rec->timestamp_filename && record__threads_enabled(rec)) {
4063 rec->timestamp_filename = false;
4064 pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4068 * Allow aliases to facilitate the lookup of symbols for address
4069 * filters. Refer to auxtrace_parse_filters().
4071 symbol_conf.allow_aliases = true;
4075 err = record__auxtrace_init(rec);
4084 if (rec->no_buildid_cache || rec->no_buildid) {
4085 disable_buildid_cache();
4086 } else if (rec->switch_output.enabled) {
4088 * In 'perf record --switch-output', disable buildid
4089 * generation by default to reduce data file switching
4090	 * overhead. Still generate buildids if they are required
	 * explicitly using
4093 * perf record --switch-output --no-no-buildid \
4094 * --no-no-buildid-cache
4096 * Following code equals to:
4098 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4099 * (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4100 * disable_buildid_cache();
4102 bool disable = true;
4104 if (rec->no_buildid_set && !rec->no_buildid)
4106 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4109 rec->no_buildid = true;
4110 rec->no_buildid_cache = true;
4111 disable_buildid_cache();
4115 if (record.opts.overwrite)
4116 record.opts.tail_synthesize = true;
4118 if (rec->evlist->core.nr_entries == 0) {
4119 bool can_profile_kernel = perf_event_paranoid_check(1);
4121 err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4126 if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4127 rec->opts.no_inherit = true;
4129 err = target__validate(&rec->opts.target);
4131 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4132 ui__warning("%s\n", errbuf);
4135 err = target__parse_uid(&rec->opts.target);
4137 int saved_errno = errno;
4139 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4140 ui__error("%s", errbuf);
4146 /* Enable ignoring missing threads when -u/-p option is defined. */
4147 rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4149 evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4151 if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4152 arch__add_leaf_frame_record_opts(&rec->opts);
4155 if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4156 if (rec->opts.target.pid != NULL) {
4157 pr_err("Couldn't create thread/CPU maps: %s\n",
4158 errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4162 usage_with_options(record_usage, record_options);
4165 err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4170	 * We take all buildids when the file contains
4171	 * AUX area tracing data, because we do not decode the
4172	 * trace, since that would take too long.
4174 if (rec->opts.full_auxtrace)
4175 rec->buildid_all = true;
4177 if (rec->opts.text_poke) {
4178 err = record__config_text_poke(rec->evlist);
4180 pr_err("record__config_text_poke failed, error %d\n", err);
4186 err = record__config_off_cpu(rec);
4188 pr_err("record__config_off_cpu failed, error %d\n", err);
4193 if (record_opts__config(&rec->opts)) {
4198 err = record__init_thread_masks(rec);
4200 pr_err("Failed to initialize parallel data streaming masks\n");
4204 if (rec->opts.nr_cblocks > nr_cblocks_max)
4205 rec->opts.nr_cblocks = nr_cblocks_max;
4206 pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4208 pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4209 pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4211 if (rec->opts.comp_level > comp_level_max)
4212 rec->opts.comp_level = comp_level_max;
4213 pr_debug("comp level: %d\n", rec->opts.comp_level);
4215 err = __cmd_record(&record, argc, argv);
4217 evlist__delete(rec->evlist);
4219 auxtrace_record__free(rec->itr);
4221 record__free_thread_masks(rec, rec->nr_threads);
4222 rec->nr_threads = 0;
4223 evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4227 static void snapshot_sig_handler(int sig __maybe_unused)
4229 struct record *rec = &record;
4231 hit_auxtrace_snapshot_trigger(rec);
4233 if (switch_output_signal(rec))
4234 trigger_hit(&switch_output_trigger);
4237 static void alarm_sig_handler(int sig __maybe_unused)
4239 struct record *rec = &record;
4241 if (switch_output_time(rec))
4242 trigger_hit(&switch_output_trigger);