// SPDX-License-Identifier: GPL-2.0
/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/llvm-utils.h"
41 #include "util/bpf-loader.h"
42 #include "util/trigger.h"
43 #include "util/perf-hooks.h"
44 #include "util/cpu-set-sched.h"
45 #include "util/synthetic-events.h"
46 #include "util/time-utils.h"
47 #include "util/units.h"
48 #include "util/bpf-event.h"
49 #include "util/util.h"
51 #include "util/clockid.h"
52 #include "util/pmu-hybrid.h"
53 #include "util/evlist-hybrid.h"
54 #include "util/off_cpu.h"
70 #ifdef HAVE_EVENTFD_SUPPORT
71 #include <sys/eventfd.h>
75 #include <sys/types.h>
78 #include <linux/err.h>
79 #include <linux/string.h>
80 #include <linux/time64.h>
81 #include <linux/zalloc.h>
82 #include <linux/bitmap.h>
85 struct switch_output {
98 struct mmap_cpu_mask maps;
99 struct mmap_cpu_mask affinity;
102 struct record_thread {
104 struct thread_mask *mask;
109 struct fdarray pollfd;
113 struct mmap **overwrite_maps;
115 unsigned long long samples;
116 unsigned long waking;
118 u64 bytes_transferred;
119 u64 bytes_compressed;
122 static __thread struct record_thread *thread;
125 THREAD_MSG__UNDEFINED = 0,
130 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
135 THREAD_SPEC__UNDEFINED = 0,
138 THREAD_SPEC__PACKAGE,
144 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
145 "undefined", "cpu", "core", "package", "numa", "user"
148 struct pollfd_index_map {
149 int evlist_pollfd_index;
150 int thread_pollfd_index;
154 struct perf_tool tool;
155 struct record_opts opts;
157 u64 thread_bytes_written;
158 struct perf_data data;
159 struct auxtrace_record *itr;
160 struct evlist *evlist;
161 struct perf_session *session;
162 struct evlist *sb_evlist;
165 bool switch_output_event_set;
168 bool no_buildid_cache;
169 bool no_buildid_cache_set;
172 bool timestamp_filename;
173 bool timestamp_boundary;
175 struct switch_output switch_output;
176 unsigned long long samples;
177 unsigned long output_max_size; /* = 0: unlimited */
178 struct perf_debuginfod debuginfod;
180 struct thread_mask *thread_masks;
181 struct record_thread *thread_data;
182 struct pollfd_index_map *index_map;
184 size_t index_map_cnt;
187 static volatile int done;
189 static volatile int auxtrace_record__snapshot_started;
190 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
191 static DEFINE_TRIGGER(switch_output_trigger);
193 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
#ifndef HAVE_GETTID
/* Fallback gettid() for libcs that do not provide one: raw syscall wrapper. */
static inline pid_t gettid(void)
{
	return (pid_t)syscall(__NR_gettid);
}
#endif
204 static int record__threads_enabled(struct record *rec)
206 return rec->opts.threads_spec;
209 static bool switch_output_signal(struct record *rec)
211 return rec->switch_output.signal &&
212 trigger_is_ready(&switch_output_trigger);
215 static bool switch_output_size(struct record *rec)
217 return rec->switch_output.size &&
218 trigger_is_ready(&switch_output_trigger) &&
219 (rec->bytes_written >= rec->switch_output.size);
222 static bool switch_output_time(struct record *rec)
224 return rec->switch_output.time &&
225 trigger_is_ready(&switch_output_trigger);
228 static u64 record__bytes_written(struct record *rec)
230 return rec->bytes_written + rec->thread_bytes_written;
233 static bool record__output_max_size_exceeded(struct record *rec)
235 return rec->output_max_size &&
236 (record__bytes_written(rec) >= rec->output_max_size);
239 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
240 void *bf, size_t size)
242 struct perf_data_file *file = &rec->session->data->file;
244 if (map && map->file)
247 if (perf_data_file__write(file, bf, size) < 0) {
248 pr_err("failed to write perf data, error: %m\n");
252 if (map && map->file) {
253 thread->bytes_written += size;
254 rec->thread_bytes_written += size;
256 rec->bytes_written += size;
259 if (record__output_max_size_exceeded(rec) && !done) {
260 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
261 " stopping session ]\n",
262 record__bytes_written(rec) >> 10);
266 if (switch_output_size(rec))
267 trigger_hit(&switch_output_trigger);
272 static int record__aio_enabled(struct record *rec);
273 static int record__comp_enabled(struct record *rec);
274 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
275 void *dst, size_t dst_size, void *src, size_t src_size);
277 #ifdef HAVE_AIO_SUPPORT
/*
 * Queue an asynchronous write of @size bytes at file offset @off.
 * Retries while the queue is full (EAGAIN); on hard failure marks the
 * control block free by resetting aio_fildes to -1.
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	do {
		rc = aio_write(cblock);
		if (rc == 0) {
			break;
		} else if (errno != EAGAIN) {
			cblock->aio_fildes = -1;
			pr_err("failed to queue perf data, error: %m\n");
			break;
		}
	} while (1);

	return rc;
}
303 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
309 ssize_t aio_ret, written;
311 aio_errno = aio_error(cblock);
312 if (aio_errno == EINPROGRESS)
315 written = aio_ret = aio_return(cblock);
317 if (aio_errno != EINTR)
318 pr_err("failed to write perf data, error: %m\n");
322 rem_size = cblock->aio_nbytes - written;
325 cblock->aio_fildes = -1;
327 * md->refcount is incremented in record__aio_pushfn() for
328 * every aio write request started in record__aio_push() so
329 * decrement it because the request is now complete.
331 perf_mmap__put(&md->core);
335 * aio write request may require restart with the
336 * reminder if the kernel didn't write whole
339 rem_off = cblock->aio_offset + written;
340 rem_buf = (void *)(cblock->aio_buf + written);
341 record__aio_write(cblock, cblock->aio_fildes,
342 rem_buf, rem_size, rem_off);
349 static int record__aio_sync(struct mmap *md, bool sync_all)
351 struct aiocb **aiocb = md->aio.aiocb;
352 struct aiocb *cblocks = md->aio.cblocks;
353 struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
358 for (i = 0; i < md->aio.nr_cblocks; ++i) {
359 if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
366 * Started aio write is not complete yet
367 * so it has to be waited before the
370 aiocb[i] = &cblocks[i];
377 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
378 if (!(errno == EAGAIN || errno == EINTR))
379 pr_err("failed to sync perf data, error: %m\n");
390 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
392 struct record_aio *aio = to;
395 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
396 * to release space in the kernel buffer as fast as possible, calling
397 * perf_mmap__consume() from perf_mmap__push() function.
399 * That lets the kernel to proceed with storing more profiling data into
400 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
402 * Coping can be done in two steps in case the chunk of profiling data
403 * crosses the upper bound of the kernel buffer. In this case we first move
404 * part of data from map->start till the upper bound and then the reminder
405 * from the beginning of the kernel buffer till the end of the data chunk.
408 if (record__comp_enabled(aio->rec)) {
409 size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
410 mmap__mmap_len(map) - aio->size,
413 memcpy(aio->data + aio->size, buf, size);
418 * Increment map->refcount to guard map->aio.data[] buffer
419 * from premature deallocation because map object can be
420 * released earlier than aio write request started on
421 * map->aio.data[] buffer is complete.
423 * perf_mmap__put() is done at record__aio_complete()
424 * after started aio request completion or at record__aio_push()
425 * if the request failed to start.
427 perf_mmap__get(&map->core);
435 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
438 int trace_fd = rec->session->data->file.fd;
439 struct record_aio aio = { .rec = rec, .size = 0 };
442 * Call record__aio_sync() to wait till map->aio.data[] buffer
443 * becomes available after previous aio write operation.
446 idx = record__aio_sync(map, false);
447 aio.data = map->aio.data[idx];
448 ret = perf_mmap__push(map, &aio, record__aio_pushfn);
449 if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
453 ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
456 rec->bytes_written += aio.size;
457 if (switch_output_size(rec))
458 trigger_hit(&switch_output_trigger);
461 * Decrement map->refcount incremented in record__aio_pushfn()
462 * back if record__aio_write() operation failed to start, otherwise
463 * map->refcount is decremented in record__aio_complete() after
464 * aio write operation finishes successfully.
466 perf_mmap__put(&map->core);
/* Current write offset of the trace file (aio writes are positioned). */
static off_t record__aio_get_pos(int trace_fd)
{
	return lseek(trace_fd, 0, SEEK_CUR);
}
/* Restore the trace file write offset after aio writes advanced it. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	lseek(trace_fd, pos, SEEK_SET);
}
482 static void record__aio_mmap_read_sync(struct record *rec)
485 struct evlist *evlist = rec->evlist;
486 struct mmap *maps = evlist->mmap;
488 if (!record__aio_enabled(rec))
491 for (i = 0; i < evlist->core.nr_mmaps; i++) {
492 struct mmap *map = &maps[i];
495 record__aio_sync(map, true);
499 static int nr_cblocks_default = 1;
500 static int nr_cblocks_max = 4;
502 static int record__aio_parse(const struct option *opt,
506 struct record_opts *opts = (struct record_opts *)opt->value;
509 opts->nr_cblocks = 0;
512 opts->nr_cblocks = strtol(str, NULL, 0);
513 if (!opts->nr_cblocks)
514 opts->nr_cblocks = nr_cblocks_default;
519 #else /* HAVE_AIO_SUPPORT */
520 static int nr_cblocks_max = 0;
522 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
523 off_t *off __maybe_unused)
528 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
533 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
537 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
542 static int record__aio_enabled(struct record *rec)
544 return rec->opts.nr_cblocks > 0;
547 #define MMAP_FLUSH_DEFAULT 1
548 static int record__mmap_flush_parse(const struct option *opt,
553 struct record_opts *opts = (struct record_opts *)opt->value;
554 static struct parse_tag tags[] = {
555 { .tag = 'B', .mult = 1 },
556 { .tag = 'K', .mult = 1 << 10 },
557 { .tag = 'M', .mult = 1 << 20 },
558 { .tag = 'G', .mult = 1 << 30 },
566 opts->mmap_flush = parse_tag_value(str, tags);
567 if (opts->mmap_flush == (int)-1)
568 opts->mmap_flush = strtol(str, NULL, 0);
571 if (!opts->mmap_flush)
572 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
574 flush_max = evlist__mmap_size(opts->mmap_pages);
576 if (opts->mmap_flush > flush_max)
577 opts->mmap_flush = flush_max;
#ifdef HAVE_ZSTD_SUPPORT
static unsigned int comp_level_default = 1;

/* Parse -z/--compression-level into opts->comp_level (default when empty). */
static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = opt->value;

	if (unset) {
		opts->comp_level = 0;
	} else {
		if (str)
			opts->comp_level = strtol(str, NULL, 0);
		if (!opts->comp_level)
			opts->comp_level = comp_level_default;
	}

	return 0;
}
#endif
static unsigned int comp_level_max = 22;
603 static int record__comp_enabled(struct record *rec)
605 return rec->opts.comp_level > 0;
608 static int process_synthesized_event(struct perf_tool *tool,
609 union perf_event *event,
610 struct perf_sample *sample __maybe_unused,
611 struct machine *machine __maybe_unused)
613 struct record *rec = container_of(tool, struct record, tool);
614 return record__write(rec, NULL, event, event->header.size);
617 static struct mutex synth_lock;
619 static int process_locked_synthesized_event(struct perf_tool *tool,
620 union perf_event *event,
621 struct perf_sample *sample __maybe_unused,
622 struct machine *machine __maybe_unused)
626 mutex_lock(&synth_lock);
627 ret = process_synthesized_event(tool, event, sample, machine);
628 mutex_unlock(&synth_lock);
632 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
634 struct record *rec = to;
636 if (record__comp_enabled(rec)) {
637 size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
642 return record__write(rec, map, bf, size);
static volatile sig_atomic_t signr = -1;
static volatile sig_atomic_t child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
/* eventfd used to wake up poll() from the signal handler. */
static volatile sig_atomic_t done_fd = -1;
#endif
651 static void sig_handler(int sig)
659 #ifdef HAVE_EVENTFD_SUPPORT
662 int orig_errno = errno;
665 * It is possible for this signal handler to run after done is
666 * checked in the main loop, but before the perf counter fds are
667 * polled. If this happens, the poll() will continue to wait
668 * even though done is set, and will only break out if either
669 * another signal is received, or the counters are ready for
670 * read. To ensure the poll() doesn't sleep when done is set,
671 * use an eventfd (done_fd) to wake up the poll().
673 if (write(done_fd, &tmp, sizeof(tmp)) < 0)
674 pr_err("failed to signal wakeup fd, error: %m\n");
678 #endif // HAVE_EVENTFD_SUPPORT
/* SIGSEGV handler: run recovery hooks, then dump the stack and re-raise. */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}
687 static void record__sig_exit(void)
692 signal(signr, SIG_DFL);
696 #ifdef HAVE_AUXTRACE_SUPPORT
698 static int record__process_auxtrace(struct perf_tool *tool,
700 union perf_event *event, void *data1,
701 size_t len1, void *data2, size_t len2)
703 struct record *rec = container_of(tool, struct record, tool);
704 struct perf_data *data = &rec->data;
708 if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
710 int fd = perf_data__fd(data);
713 file_offset = lseek(fd, 0, SEEK_CUR);
714 if (file_offset == -1)
716 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
722 /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
723 padding = (len1 + len2) & 7;
725 padding = 8 - padding;
727 record__write(rec, map, event, event->header.size);
728 record__write(rec, map, data1, len1);
730 record__write(rec, map, data2, len2);
731 record__write(rec, map, &pad, padding);
736 static int record__auxtrace_mmap_read(struct record *rec,
741 ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
742 record__process_auxtrace);
752 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
757 ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
758 record__process_auxtrace,
759 rec->opts.auxtrace_snapshot_size);
769 static int record__auxtrace_read_snapshot_all(struct record *rec)
774 for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
775 struct mmap *map = &rec->evlist->mmap[i];
777 if (!map->auxtrace_mmap.base)
780 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
789 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
791 pr_debug("Recording AUX area tracing snapshot\n");
792 if (record__auxtrace_read_snapshot_all(rec) < 0) {
793 trigger_error(&auxtrace_snapshot_trigger);
795 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
796 trigger_error(&auxtrace_snapshot_trigger);
798 trigger_ready(&auxtrace_snapshot_trigger);
802 static int record__auxtrace_snapshot_exit(struct record *rec)
804 if (trigger_is_error(&auxtrace_snapshot_trigger))
807 if (!auxtrace_record__snapshot_started &&
808 auxtrace_record__snapshot_start(rec->itr))
811 record__read_auxtrace_snapshot(rec, true);
812 if (trigger_is_error(&auxtrace_snapshot_trigger))
818 static int record__auxtrace_init(struct record *rec)
822 if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
823 && record__threads_enabled(rec)) {
824 pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
829 rec->itr = auxtrace_record__init(rec->evlist, &err);
834 err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
835 rec->opts.auxtrace_snapshot_opts);
839 err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
840 rec->opts.auxtrace_sample_opts);
844 auxtrace_regroup_aux_output(rec->evlist);
846 return auxtrace_parse_filters(rec->evlist);
852 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
853 struct mmap *map __maybe_unused)
859 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
860 bool on_exit __maybe_unused)
865 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
871 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
876 static int record__auxtrace_init(struct record *rec __maybe_unused)
883 static int record__config_text_poke(struct evlist *evlist)
887 /* Nothing to do if text poke is already configured */
888 evlist__for_each_entry(evlist, evsel) {
889 if (evsel->core.attr.text_poke)
893 evsel = evlist__add_dummy_on_all_cpus(evlist);
897 evsel->core.attr.text_poke = 1;
898 evsel->core.attr.ksymbol = 1;
899 evsel->immediate = true;
900 evsel__set_sample_bit(evsel, TIME);
905 static int record__config_off_cpu(struct record *rec)
907 return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
910 static bool record__kcore_readable(struct machine *machine)
912 char kcore[PATH_MAX];
915 scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
917 fd = open(kcore, O_RDONLY);
926 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
928 char from_dir[PATH_MAX];
929 char kcore_dir[PATH_MAX];
932 snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
934 ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
938 return kcore_copy(from_dir, kcore_dir);
941 static void record__thread_data_init_pipes(struct record_thread *thread_data)
943 thread_data->pipes.msg[0] = -1;
944 thread_data->pipes.msg[1] = -1;
945 thread_data->pipes.ack[0] = -1;
946 thread_data->pipes.ack[1] = -1;
949 static int record__thread_data_open_pipes(struct record_thread *thread_data)
951 if (pipe(thread_data->pipes.msg))
954 if (pipe(thread_data->pipes.ack)) {
955 close(thread_data->pipes.msg[0]);
956 thread_data->pipes.msg[0] = -1;
957 close(thread_data->pipes.msg[1]);
958 thread_data->pipes.msg[1] = -1;
962 pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
963 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
964 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
969 static void record__thread_data_close_pipes(struct record_thread *thread_data)
971 if (thread_data->pipes.msg[0] != -1) {
972 close(thread_data->pipes.msg[0]);
973 thread_data->pipes.msg[0] = -1;
975 if (thread_data->pipes.msg[1] != -1) {
976 close(thread_data->pipes.msg[1]);
977 thread_data->pipes.msg[1] = -1;
979 if (thread_data->pipes.ack[0] != -1) {
980 close(thread_data->pipes.ack[0]);
981 thread_data->pipes.ack[0] = -1;
983 if (thread_data->pipes.ack[1] != -1) {
984 close(thread_data->pipes.ack[1]);
985 thread_data->pipes.ack[1] = -1;
989 static bool evlist__per_thread(struct evlist *evlist)
991 return cpu_map__is_dummy(evlist->core.user_requested_cpus);
994 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
996 int m, tm, nr_mmaps = evlist->core.nr_mmaps;
997 struct mmap *mmap = evlist->mmap;
998 struct mmap *overwrite_mmap = evlist->overwrite_mmap;
999 struct perf_cpu_map *cpus = evlist->core.all_cpus;
1000 bool per_thread = evlist__per_thread(evlist);
1003 thread_data->nr_mmaps = nr_mmaps;
1005 thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1006 thread_data->mask->maps.nbits);
1008 thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1009 if (!thread_data->maps)
1012 if (overwrite_mmap) {
1013 thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1014 if (!thread_data->overwrite_maps) {
1015 zfree(&thread_data->maps);
1019 pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1020 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1022 for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1024 test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1025 if (thread_data->maps) {
1026 thread_data->maps[tm] = &mmap[m];
1027 pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1028 thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1030 if (thread_data->overwrite_maps) {
1031 thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1032 pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1033 thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1042 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1045 struct mmap *map, *overwrite_map;
1047 fdarray__init(&thread_data->pollfd, 64);
1049 for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1050 map = thread_data->maps ? thread_data->maps[tm] : NULL;
1051 overwrite_map = thread_data->overwrite_maps ?
1052 thread_data->overwrite_maps[tm] : NULL;
1054 for (f = 0; f < evlist->core.pollfd.nr; f++) {
1055 void *ptr = evlist->core.pollfd.priv[f].ptr;
1057 if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1058 pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1059 &evlist->core.pollfd);
1062 pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1063 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1071 static void record__free_thread_data(struct record *rec)
1074 struct record_thread *thread_data = rec->thread_data;
1076 if (thread_data == NULL)
1079 for (t = 0; t < rec->nr_threads; t++) {
1080 record__thread_data_close_pipes(&thread_data[t]);
1081 zfree(&thread_data[t].maps);
1082 zfree(&thread_data[t].overwrite_maps);
1083 fdarray__exit(&thread_data[t].pollfd);
1086 zfree(&rec->thread_data);
1089 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1090 int evlist_pollfd_index,
1091 int thread_pollfd_index)
1093 size_t x = rec->index_map_cnt;
1095 if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1097 rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1098 rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1099 rec->index_map_cnt += 1;
1103 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1104 struct evlist *evlist,
1105 struct record_thread *thread_data)
1107 struct pollfd *e_entries = evlist->core.pollfd.entries;
1108 struct pollfd *t_entries = thread_data->pollfd.entries;
1112 for (i = 0; i < rec->index_map_cnt; i++) {
1113 int e_pos = rec->index_map[i].evlist_pollfd_index;
1114 int t_pos = rec->index_map[i].thread_pollfd_index;
1116 if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1117 e_entries[e_pos].events != t_entries[t_pos].events) {
1118 pr_err("Thread and evlist pollfd index mismatch\n");
1122 e_entries[e_pos].revents = t_entries[t_pos].revents;
1127 static int record__dup_non_perf_events(struct record *rec,
1128 struct evlist *evlist,
1129 struct record_thread *thread_data)
1131 struct fdarray *fda = &evlist->core.pollfd;
1134 for (i = 0; i < fda->nr; i++) {
1135 if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1137 ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1139 pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1142 pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1143 thread_data, ret, fda->entries[i].fd);
1144 ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1146 pr_err("Failed to map thread and evlist pollfd indexes\n");
1153 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1156 struct record_thread *thread_data;
1158 rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1159 if (!rec->thread_data) {
1160 pr_err("Failed to allocate thread data\n");
1163 thread_data = rec->thread_data;
1165 for (t = 0; t < rec->nr_threads; t++)
1166 record__thread_data_init_pipes(&thread_data[t]);
1168 for (t = 0; t < rec->nr_threads; t++) {
1169 thread_data[t].rec = rec;
1170 thread_data[t].mask = &rec->thread_masks[t];
1171 ret = record__thread_data_init_maps(&thread_data[t], evlist);
1173 pr_err("Failed to initialize thread[%d] maps\n", t);
1176 ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1178 pr_err("Failed to initialize thread[%d] pollfd\n", t);
1182 thread_data[t].tid = -1;
1183 ret = record__thread_data_open_pipes(&thread_data[t]);
1185 pr_err("Failed to open thread[%d] communication pipes\n", t);
1188 ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1189 POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1191 pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1194 thread_data[t].ctlfd_pos = ret;
1195 pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1196 thread_data, thread_data[t].ctlfd_pos,
1197 thread_data[t].pipes.msg[0]);
1199 thread_data[t].tid = gettid();
1201 ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1205 thread_data[t].ctlfd_pos = -1; /* Not used */
1212 record__free_thread_data(rec);
1217 static int record__mmap_evlist(struct record *rec,
1218 struct evlist *evlist)
1221 struct record_opts *opts = &rec->opts;
1222 bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1223 opts->auxtrace_sample_mode;
1226 if (opts->affinity != PERF_AFFINITY_SYS)
1227 cpu__setup_cpunode_map();
1229 if (evlist__mmap_ex(evlist, opts->mmap_pages,
1230 opts->auxtrace_mmap_pages,
1232 opts->nr_cblocks, opts->affinity,
1233 opts->mmap_flush, opts->comp_level) < 0) {
1234 if (errno == EPERM) {
1235 pr_err("Permission error mapping pages.\n"
1236 "Consider increasing "
1237 "/proc/sys/kernel/perf_event_mlock_kb,\n"
1238 "or try again with a smaller value of -m/--mmap_pages.\n"
1239 "(current value: %u,%u)\n",
1240 opts->mmap_pages, opts->auxtrace_mmap_pages);
1243 pr_err("failed to mmap with %d (%s)\n", errno,
1244 str_error_r(errno, msg, sizeof(msg)));
1252 if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1255 ret = record__alloc_thread_data(rec, evlist);
1259 if (record__threads_enabled(rec)) {
1260 ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1262 pr_err("Failed to create data directory: %s\n", strerror(-ret));
1265 for (i = 0; i < evlist->core.nr_mmaps; i++) {
1267 evlist->mmap[i].file = &rec->data.dir.files[i];
1268 if (evlist->overwrite_mmap)
1269 evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1276 static int record__mmap(struct record *rec)
1278 return record__mmap_evlist(rec, rec->evlist);
1281 static int record__open(struct record *rec)
1285 struct evlist *evlist = rec->evlist;
1286 struct perf_session *session = rec->session;
1287 struct record_opts *opts = &rec->opts;
1291 * For initial_delay, system wide or a hybrid system, we need to add a
1292 * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
1293 * of waiting or event synthesis.
1295 if (opts->initial_delay || target__has_cpu(&opts->target) ||
1296 perf_pmu__has_hybrid()) {
1297 pos = evlist__get_tracking_event(evlist);
1298 if (!evsel__is_dummy_event(pos)) {
1299 /* Set up dummy event. */
1300 if (evlist__add_dummy(evlist))
1302 pos = evlist__last(evlist);
1303 evlist__set_tracking_event(evlist, pos);
1307 * Enable the dummy event when the process is forked for
1308 * initial_delay, immediately for system wide.
1310 if (opts->initial_delay && !pos->immediate &&
1311 !target__has_cpu(&opts->target))
1312 pos->core.attr.enable_on_exec = 1;
1317 evlist__config(evlist, opts, &callchain_param);
1319 evlist__for_each_entry(evlist, pos) {
1321 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1322 if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
1324 ui__warning("%s\n", msg);
1327 if ((errno == EINVAL || errno == EBADF) &&
1328 pos->core.leader != &pos->core &&
1330 pos = evlist__reset_weak_group(evlist, pos, true);
1334 evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1335 ui__error("%s\n", msg);
1339 pos->supported = true;
1342 if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1344 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1345 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1346 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1347 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1348 "Samples in kernel modules won't be resolved at all.\n\n"
1349 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1350 "even with a suitable vmlinux or kallsyms file.\n\n");
1353 if (evlist__apply_filters(evlist, &pos)) {
1354 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1355 pos->filter, evsel__name(pos), errno,
1356 str_error_r(errno, msg, sizeof(msg)));
1361 rc = record__mmap(rec);
1365 session->evlist = evlist;
1366 perf_session__set_id_hdr_size(session);
1371 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1373 if (rec->evlist->first_sample_time == 0)
1374 rec->evlist->first_sample_time = sample_time;
1377 rec->evlist->last_sample_time = sample_time;
1380 static int process_sample_event(struct perf_tool *tool,
1381 union perf_event *event,
1382 struct perf_sample *sample,
1383 struct evsel *evsel,
1384 struct machine *machine)
1386 struct record *rec = container_of(tool, struct record, tool);
1388 set_timestamp_boundary(rec, sample->time);
1390 if (rec->buildid_all)
1394 return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1397 static int process_buildids(struct record *rec)
1399 struct perf_session *session = rec->session;
1401 if (perf_data__size(&rec->data) == 0)
1405 * During this process, it'll load kernel map and replace the
1406 * dso->long_name to a real pathname it found. In this case
1407 * we prefer the vmlinux path like
1408 * /lib/modules/3.16.4/build/vmlinux
1410 * rather than build-id path (in debug directory).
1411 * $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1413 symbol_conf.ignore_vmlinux_buildid = true;
1416 * If --buildid-all is given, it marks all DSO regardless of hits,
1417 * so no need to process samples. But if timestamp_boundary is enabled,
1418 * it still needs to walk on all samples to get the timestamps of
1419 * first/last samples.
1421 if (rec->buildid_all && !rec->timestamp_boundary)
1422 rec->tool.sample = NULL;
1424 return perf_session__process_events(session);
/*
 * Per-guest-machine callback: synthesize module and kernel mmap events
 * for a guest kernel so symbols can be resolved at report time.  'data'
 * is the struct perf_tool passed through machines__process_guests().
 *
 * NOTE(review): error-path bodies after the pr_err() calls appear elided
 * in this copy; confirm against upstream before changing control flow.
 */
1427 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1430 struct perf_tool *tool = data;
1432 *As for guest kernel when processing subcommand record&report,
1433 *we arrange module mmap prior to guest kernel mmap and trigger
1434 *a preload dso because default guest module symbols are loaded
1435 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1436 *method is used to avoid symbol missing when the first addr is
1437 *in module instead of in guest kernel.
1439 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1442 pr_err("Couldn't record guest kernel [%d]'s reference"
1443 " relocation symbol.\n", machine->pid);
1446 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1447 * have no _text sometimes.
1449 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1452 pr_err("Couldn't record guest kernel [%d]'s reference"
1453 " relocation symbol.\n", machine->pid);
1456 static struct perf_event_header finished_round_event = {
1457 .size = sizeof(struct perf_event_header),
1458 .type = PERF_RECORD_FINISHED_ROUND,
1461 static struct perf_event_header finished_init_event = {
1462 .size = sizeof(struct perf_event_header),
1463 .type = PERF_RECORD_FINISHED_INIT,
/*
 * If the affinity mode is not PERF_AFFINITY_SYS and this thread's current
 * affinity mask differs from the mmap's, migrate the current thread onto
 * the CPUs backing 'map' before reading it (reduces cross-node traffic).
 * Uses the __thread 'thread' pointer for the per-thread state.
 */
1466 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1468 if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1469 !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1470 thread->mask->affinity.nbits)) {
/* Copy the map's affinity mask into the thread's mask (zero, then OR in). */
1471 bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1472 bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1473 map->affinity_mask.bits, thread->mask->affinity.nbits);
1474 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1475 (cpu_set_t *)thread->mask->affinity.bits);
1477 pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1478 mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1483 static size_t process_comp_header(void *record, size_t increment)
1485 struct perf_record_compressed *event = record;
1486 size_t size = sizeof(*event);
1489 event->header.size += increment;
1493 event->header.type = PERF_RECORD_COMPRESSED;
1494 event->header.size = size;
/*
 * Compress 'src_size' bytes from 'src' into 'dst' as one or more
 * PERF_RECORD_COMPRESSED records, returning the compressed size.
 * In threaded (per-map file) mode the per-map zstd context and the
 * per-thread byte counters are used; otherwise the session-wide ones.
 */
1499 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1500 void *dst, size_t dst_size, void *src, size_t src_size)
/* Leave room for the compressed-record header within the max sample size. */
1503 size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1504 struct zstd_data *zstd_data = &session->zstd_data;
1506 if (map && map->file)
1507 zstd_data = &map->zstd_data;
1509 compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1510 max_record_size, process_comp_header);
/* Account transferred vs. compressed bytes for the final ratio report. */
1512 if (map && map->file) {
1513 thread->bytes_transferred += src_size;
1514 thread->bytes_compressed += compressed;
1516 session->bytes_transferred += src_size;
1517 session->bytes_compressed += compressed;
/*
 * Drain all mmap ring buffers of 'evlist' (either the regular or the
 * overwrite set, per 'overwrite') into the output, via AIO pushes when
 * enabled or synchronous perf_mmap__push() otherwise.  Also reads AUX
 * area data when not in snapshot/sample mode.  Emits a FINISHED_ROUND
 * event when anything was written (non-threaded mode only).
 *
 * NOTE(review): this copy appears truncated (gotos/early returns around
 * the push error paths look elided); verify control flow upstream before
 * restructuring.
 */
1523 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1524 bool overwrite, bool synch)
1526 u64 bytes_written = rec->bytes_written;
1531 int trace_fd = rec->data.file.fd;
1537 nr_mmaps = thread->nr_mmaps;
1538 maps = overwrite ? thread->overwrite_maps : thread->maps;
/* Overwrite buffers are only readable once paused (DATA_PENDING). */
1543 if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1546 if (record__aio_enabled(rec))
1547 off = record__aio_get_pos(trace_fd);
1549 for (i = 0; i < nr_mmaps; i++) {
1551 struct mmap *map = maps[i];
1553 if (map->core.base) {
1554 record__adjust_affinity(rec, map);
/* When synchronously flushing, temporarily drop the flush threshold to 1. */
1556 flush = map->core.flush;
1557 map->core.flush = 1;
1559 if (!record__aio_enabled(rec)) {
1560 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1562 map->core.flush = flush;
1567 if (record__aio_push(rec, map, &off) < 0) {
1568 record__aio_set_pos(trace_fd, off);
1570 map->core.flush = flush;
1576 map->core.flush = flush;
1579 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1580 !rec->opts.auxtrace_sample_mode &&
1581 record__auxtrace_mmap_read(rec, map) != 0) {
1587 if (record__aio_enabled(rec))
1588 record__aio_set_pos(trace_fd, off);
1591 * Mark the round finished in case we wrote
1592 * at least one event.
1594 * No need for round events in directory mode,
1595 * because per-cpu maps and files have data
1598 if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1599 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
/* Re-arm overwrite buffers for the next round. */
1602 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1607 static int record__mmap_read_all(struct record *rec, bool synch)
1611 err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1615 return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1618 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1619 void *arg __maybe_unused)
1621 struct perf_mmap *map = fda->priv[fd].ptr;
1624 perf_mmap__put(map);
/*
 * Per-thread entry point in parallel (--threads) trace streaming mode.
 * Acks startup over the thread's ack pipe, then loops: drain mmaps, poll
 * when idle, filter dead fds, and honour the control-pipe HUP as the
 * termination signal.  Does a final synchronous drain and acks shutdown.
 *
 * NOTE(review): loop braces and some error/termination paths appear
 * elided in this copy; confirm against upstream before restructuring.
 */
1627 static void *record__thread(void *arg)
1629 enum thread_msg msg = THREAD_MSG__READY;
1630 bool terminate = false;
1631 struct fdarray *pollfd;
/* Publish this thread's state through the __thread 'thread' pointer. */
1635 thread->tid = gettid();
1637 err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1639 pr_warning("threads[%d]: failed to notify on start: %s\n",
1640 thread->tid, strerror(errno));
1642 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1644 pollfd = &thread->pollfd;
1645 ctlfd_pos = thread->ctlfd_pos;
1648 unsigned long long hits = thread->samples;
1650 if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
/* No new samples this round: block in poll until something arrives. */
1653 if (hits == thread->samples) {
1655 err = fdarray__poll(pollfd, -1);
1657 * Propagate error, only if there's any. Ignore positive
1658 * number of returned events and interrupt error.
1660 if (err > 0 || (err < 0 && errno == EINTR))
1664 if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1665 record__thread_munmap_filtered, NULL) == 0)
/* HUP on the control pipe means the main thread asked us to stop. */
1669 if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1671 close(thread->pipes.msg[0]);
1672 thread->pipes.msg[0] = -1;
1673 pollfd->entries[ctlfd_pos].fd = -1;
1674 pollfd->entries[ctlfd_pos].events = 0;
1677 pollfd->entries[ctlfd_pos].revents = 0;
/* Final synchronous drain before exiting. */
1679 record__mmap_read_all(thread->rec, true);
1681 err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1683 pr_warning("threads[%d]: failed to notify on termination: %s\n",
1684 thread->tid, strerror(errno));
/*
 * Enable every header feature, then clear the ones not applicable to
 * this record session (build-ids, tracing data, branch stack, auxtrace,
 * clock info, directory format, compression, stat).
 */
1689 static void record__init_features(struct record *rec)
1691 struct perf_session *session = rec->session;
/* Start from "all features on" and subtract below. */
1694 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1695 perf_header__set_feat(&session->header, feat);
1697 if (rec->no_buildid)
1698 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1700 #ifdef HAVE_LIBTRACEEVENT
1701 if (!have_tracepoints(&rec->evlist->core.entries))
1702 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1705 if (!rec->opts.branch_stack)
1706 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1708 if (!rec->opts.full_auxtrace)
1709 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1711 if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1712 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1714 if (!rec->opts.use_clockid)
1715 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
/* Directory format only applies to parallel (threaded) streaming. */
1717 if (!record__threads_enabled(rec))
1718 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1720 if (!record__comp_enabled(rec))
1721 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
/* HEADER_STAT is never produced by 'perf record'. */
1723 perf_header__clear_feat(&session->header, HEADER_STAT);
/*
 * Finalize the output file: record data size(s), optionally process
 * build-ids, and rewrite the header in its final form.
 *
 * NOTE(review): the return type line and some braces appear elided in
 * this copy; verify against upstream.
 */
1727 record__finish_output(struct record *rec)
1730 struct perf_data *data = &rec->data;
1731 int fd = perf_data__fd(data);
1736 rec->session->header.data_size += rec->bytes_written;
/* Current file offset == total size of what was written. */
1737 data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1738 if (record__threads_enabled(rec)) {
1739 for (i = 0; i < data->dir.nr; i++)
1740 data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1743 if (!rec->no_buildid) {
1744 process_buildids(rec);
1746 if (rec->buildid_all)
1747 dsos__hit_all(rec->session);
/* at_exit=true: write the final on-disk header. */
1749 perf_session__write_header(rec->session, rec->evlist, fd, true);
/*
 * Synthesize COMM/MMAP events for the forked workload's thread via a
 * temporary single-tid thread_map.  'tail' gates whether this runs at
 * start or end of the session (matched against opts.tail_synthesize).
 */
1754 static int record__synthesize_workload(struct record *rec, bool tail)
1757 struct perf_thread_map *thread_map;
1758 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1760 if (rec->opts.tail_synthesize != tail)
1763 thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1764 if (thread_map == NULL)
1767 err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1768 process_synthesized_event,
1769 &rec->session->machines.host,
1771 rec->opts.sample_address);
/* thread_map is only needed for the synthesis call above. */
1772 perf_thread_map__put(thread_map);
1776 static int write_finished_init(struct record *rec, bool tail)
1778 if (rec->opts.tail_synthesize != tail)
1781 return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1784 static int record__synthesize(struct record *rec, bool tail);
/*
 * Rotate the output file (--switch-output): finish the current file,
 * reopen a timestamped one, reset byte counters, and re-synthesize the
 * tracking events so the new file is self-contained.  With
 * --switch-output=N files, old rotations are pruned ring-buffer style.
 *
 * NOTE(review): the return type line, several error checks and the
 * trailing 'return fd;' appear elided in this copy; verify upstream.
 */
record__switch_output(struct record *rec, bool at_exit)
1789 struct perf_data *data = &rec->data;
1793 /* Same Size: "2015122520103046"*/
1794 char timestamp[] = "InvalidTimestamp";
1796 record__aio_mmap_read_sync(rec);
1798 write_finished_init(rec, true);
1800 record__synthesize(rec, true);
1801 if (target__none(&rec->opts.target))
1802 record__synthesize_workload(rec, true);
1805 record__finish_output(rec);
1806 err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1808 pr_err("Failed to get current timestamp\n");
/* Switch the data object over to a new <path>.<timestamp> file. */
1812 fd = perf_data__switch(data, timestamp,
1813 rec->session->header.data_offset,
1814 at_exit, &new_filename);
1815 if (fd >= 0 && !at_exit) {
1816 rec->bytes_written = 0;
1817 rec->session->header.data_size = 0;
1821 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1822 data->path, timestamp);
/* Keep at most switch_output.num_files rotations, dropping the oldest. */
1824 if (rec->switch_output.num_files) {
1825 int n = rec->switch_output.cur_file + 1;
1827 if (n >= rec->switch_output.num_files)
1829 rec->switch_output.cur_file = n;
1830 if (rec->switch_output.filenames[n]) {
1831 remove(rec->switch_output.filenames[n]);
1832 zfree(&rec->switch_output.filenames[n]);
1834 rec->switch_output.filenames[n] = new_filename;
1839 /* Output tracking events */
1841 record__synthesize(rec, false);
1844 * In 'perf record --switch-output' without -a,
1845 * record__synthesize() in record__switch_output() won't
1846 * generate tracking events because there's no thread_map
1847 * in evlist. Which causes newly created perf.data doesn't
1848 * contain map and comm information.
1849 * Create a fake thread_map and directly call
1850 * perf_event__synthesize_thread_map() for those events.
1852 if (target__none(&rec->opts.target))
1853 record__synthesize_workload(rec, false);
1854 write_finished_init(rec, false);
/*
 * Read the LOST count for one (cpu, thread) slot of 'evsel' and, when
 * non-zero, emit a PERF_RECORD_LOST_SAMPLES event (with an id sample
 * appended when the evsel has sample ids) into 'lost', then write it.
 */
1859 static void __record__read_lost_samples(struct record *rec, struct evsel *evsel,
1860 struct perf_record_lost_samples *lost,
1861 int cpu_idx, int thread_idx)
1863 struct perf_counts_values count;
1864 struct perf_sample_id *sid;
1865 struct perf_sample sample = {};
1868 if (perf_evsel__read(&evsel->core, cpu_idx, thread_idx, &count) < 0) {
1869 pr_debug("read LOST count failed\n");
/* Nothing lost on this slot: no event to emit. */
1873 if (count.lost == 0)
1876 lost->lost = count.lost;
1877 if (evsel->core.ids) {
1878 sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1879 sample.id = sid->id;
/* Append the id sample directly after the fixed-size lost record. */
1882 id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1883 evsel->core.attr.sample_type, &sample);
1884 lost->header.size = sizeof(*lost) + id_hdr_size;
1885 record__write(rec, NULL, lost, lost->header.size);
/*
 * At session end, walk every evsel and every (cpu, thread) slot reading
 * the kernel's LOST counts, emitting PERF_RECORD_LOST_SAMPLES events so
 * report tooling can account for dropped samples.
 */
1888 static void record__read_lost_samples(struct record *rec)
1890 struct perf_session *session = rec->session;
1891 struct perf_record_lost_samples *lost;
1892 struct evsel *evsel;
1894 /* there was an error during record__open */
1895 if (session->evlist == NULL)
/* One max-size scratch buffer reused for every emitted event. */
1898 lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1900 pr_debug("Memory allocation failed\n");
1904 lost->header.type = PERF_RECORD_LOST_SAMPLES;
1906 evlist__for_each_entry(session->evlist, evsel) {
1907 struct xyarray *xy = evsel->core.sample_id;
1909 if (xy == NULL || evsel->core.fd == NULL)
/* Sanity: fd matrix and sample-id matrix must have identical shape. */
1911 if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1912 xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1913 pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1917 for (int x = 0; x < xyarray__max_x(xy); x++) {
1918 for (int y = 0; y < xyarray__max_y(xy); y++) {
1919 __record__read_lost_samples(rec, evsel, lost, x, y);
/* errno from a failed workload exec, delivered via SIGUSR1 (see below). */
1927 static volatile sig_atomic_t workload_exec_errno;
1930 * evlist__prepare_workload will send a SIGUSR1
1931 * if the fork fails, since we asked by setting its
1932 * want_signal to true.
/*
 * SIGUSR1 handler: stash the workload's exec errno (carried in the
 * siginfo value) for the main loop to report.
 * NOTE(review): the 'siginfo_t *info' parameter line appears elided in
 * this copy — the body references 'info'; verify against upstream.
 */
1934 static void workload_exec_failed_signal(int signo __maybe_unused,
1936 void *ucontext __maybe_unused)
1938 workload_exec_errno = info->si_value.sival_int;
1943 static void snapshot_sig_handler(int sig);
1944 static void alarm_sig_handler(int sig);
1946 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1949 if (evlist->mmap && evlist->mmap[0].core.base)
1950 return evlist->mmap[0].core.base;
1951 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1952 return evlist->overwrite_mmap[0].core.base;
1957 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1959 const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
/*
 * Synthesize all non-sample "tracking" events for the session: pipe
 * header data, time conversion, id index, auxtrace info, kernel and
 * module mmaps, guest OS events, extra attrs, thread/cpu maps, BPF and
 * cgroup events, and finally existing threads/mmaps (optionally with
 * multiple synthesis threads).  'tail' selects head- vs tail-of-session
 * invocation via opts.tail_synthesize.
 *
 * NOTE(review): many 'if (err ...)' / 'goto out' lines appear elided in
 * this copy; verify error handling against upstream.
 */
1965 static int record__synthesize(struct record *rec, bool tail)
1967 struct perf_session *session = rec->session;
1968 struct machine *machine = &session->machines.host;
1969 struct perf_data *data = &rec->data;
1970 struct record_opts *opts = &rec->opts;
1971 struct perf_tool *tool = &rec->tool;
1973 event_op f = process_synthesized_event;
1975 if (rec->opts.tail_synthesize != tail)
/* Pipe output needs attr/feature events synthesized in-band. */
1978 if (data->is_pipe) {
1979 err = perf_event__synthesize_for_pipe(tool, session, data,
1980 process_synthesized_event);
1984 rec->bytes_written += err;
1987 err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1988 process_synthesized_event, machine);
1992 /* Synthesize id_index before auxtrace_info */
1993 err = perf_event__synthesize_id_index(tool,
1994 process_synthesized_event,
1995 session->evlist, machine);
1999 if (rec->opts.full_auxtrace) {
2000 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2001 session, process_synthesized_event);
2006 if (!evlist__exclude_kernel(rec->evlist)) {
2007 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2009 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2010 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2011 "Check /proc/kallsyms permission or run as root.\n");
2013 err = perf_event__synthesize_modules(tool, process_synthesized_event,
2015 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2016 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2017 "Check /proc/modules permission or run as root.\n");
2021 machines__process_guests(&session->machines,
2022 perf_event__synthesize_guest_os, tool);
2025 err = perf_event__synthesize_extra_attr(&rec->tool,
2027 process_synthesized_event,
2032 err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2033 process_synthesized_event,
2036 pr_err("Couldn't synthesize thread map.\n");
2040 err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2041 process_synthesized_event, NULL);
2043 pr_err("Couldn't synthesize cpu map.\n");
2047 err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2050 pr_warning("Couldn't synthesize bpf events.\n");
2054 if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2055 err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2058 pr_warning("Couldn't synthesize cgroup events.\n");
/* Multithreaded synthesis: switch to the lock-protected emit callback. */
2063 if (rec->opts.nr_threads_synthesize > 1) {
2064 mutex_init(&synth_lock);
2065 perf_set_multithreaded();
2066 f = process_locked_synthesized_event;
2069 if (rec->opts.synth & PERF_SYNTH_TASK) {
2070 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2072 err = __machine__synthesize_threads(machine, tool, &opts->target,
2073 rec->evlist->core.threads,
2074 f, needs_mmap, opts->sample_address,
2075 rec->opts.nr_threads_synthesize);
2078 if (rec->opts.nr_threads_synthesize > 1) {
2079 perf_set_singlethreaded();
2080 mutex_destroy(&synth_lock);
2087 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2089 struct record *rec = data;
2090 pthread_kill(rec->thread_id, SIGUSR2);
/*
 * Configure and start the side-band event thread: hook up the SIGUSR2
 * callback when --switch-output-event populated sb_evlist, and (with
 * libbpf) add the BPF side-band event unless --no-bpf-event.  Failure to
 * start the thread only disables BPF annotation, it is not fatal.
 */
2094 static int record__setup_sb_evlist(struct record *rec)
2096 struct record_opts *opts = &rec->opts;
2098 if (rec->sb_evlist != NULL) {
2100 * We get here if --switch-output-event populated the
2101 * sb_evlist, so associate a callback that will send a SIGUSR2
2102 * to the main thread.
2104 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2105 rec->thread_id = pthread_self();
2107 #ifdef HAVE_LIBBPF_SUPPORT
2108 if (!opts->no_bpf_event) {
/* Lazily create the side-band evlist if --switch-output-event didn't. */
2109 if (rec->sb_evlist == NULL) {
2110 rec->sb_evlist = evlist__new();
2112 if (rec->sb_evlist == NULL) {
2113 pr_err("Couldn't create side band evlist.\n.");
2118 if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2119 pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2124 if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2125 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2126 opts->no_bpf_event = true;
/*
 * With --clockid, capture a paired (wall-clock, session-clock) reference
 * timestamp plus the clock id/resolution into the header env, so report
 * time can convert session timestamps to time-of-day.
 */
2132 static int record__init_clock(struct record *rec)
2134 struct perf_session *session = rec->session;
2135 struct timespec ref_clockid;
2136 struct timeval ref_tod;
/* No --clockid: nothing to record. */
2139 if (!rec->opts.use_clockid)
2142 if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2143 session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2145 session->header.env.clock.clockid = rec->opts.clockid;
2147 if (gettimeofday(&ref_tod, NULL) != 0) {
2148 pr_err("gettimeofday failed, cannot set reference time.\n");
2152 if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2153 pr_err("clock_gettime failed, cannot set reference time.\n");
/* Both references converted to nanoseconds for the header. */
2157 ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2158 (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2160 session->header.env.clock.tod_ns = ref;
2162 ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2163 (u64) ref_clockid.tv_nsec;
2165 session->header.env.clock.clockid_ns = ref;
2169 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2171 if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2172 trigger_hit(&auxtrace_snapshot_trigger);
2173 auxtrace_record__snapshot_started = 1;
2174 if (auxtrace_record__snapshot_start(rec->itr))
2175 trigger_error(&auxtrace_snapshot_trigger);
/*
 * On hybrid (big.LITTLE style) systems, rewrite hybrid event names to
 * the "pmu/event/" form so identically-named events on different PMUs
 * stay distinguishable.  Events already containing '/' are left alone.
 *
 * NOTE(review): the asprintf() error check and free of the old name
 * appear elided in this copy; verify against upstream.
 */
2179 static void record__uniquify_name(struct record *rec)
2182 struct evlist *evlist = rec->evlist;
2186 if (!perf_pmu__has_hybrid())
2189 evlist__for_each_entry(evlist, pos) {
2190 if (!evsel__is_hybrid(pos))
2193 if (strchr(pos->name, '/'))
2196 ret = asprintf(&new_name, "%s/%s/",
2197 pos->pmu_name, pos->name);
2200 pos->name = new_name;
/*
 * Ask one streaming thread to stop by closing the write end of its msg
 * pipe (the thread sees POLLHUP), then wait for its termination ack.
 */
2205 static int record__terminate_thread(struct record_thread *thread_data)
2208 enum thread_msg ack = THREAD_MSG__UNDEFINED;
2209 pid_t tid = thread_data->tid;
/* Closing msg[1] delivers HUP to the thread's control pollfd. */
2211 close(thread_data->pipes.msg[1]);
2212 thread_data->pipes.msg[1] = -1;
2213 err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2215 pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2217 pr_warning("threads[%d]: failed to receive termination notification from %d\n",
/*
 * Spawn the parallel streaming threads (threads 1..nr_threads-1; slot 0
 * is the main thread).  Signals are blocked around pthread_create so the
 * workers inherit an empty mask; each worker's CPU affinity is set via
 * pthread attrs when available, and startup is synchronized over the
 * per-thread ack pipes.  Finally pins the main thread to its own mask.
 *
 * NOTE(review): several error checks/returns appear elided in this copy;
 * verify against upstream before restructuring.
 */
2223 static int record__start_threads(struct record *rec)
2225 int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2226 struct record_thread *thread_data = rec->thread_data;
2227 sigset_t full, mask;
2229 pthread_attr_t attrs;
/* The __thread 'thread' pointer of the main thread is slot 0. */
2231 thread = &thread_data[0];
2233 if (!record__threads_enabled(rec))
2237 if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2238 pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2242 pthread_attr_init(&attrs);
2243 pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2245 for (t = 1; t < nr_threads; t++) {
2246 enum thread_msg msg = THREAD_MSG__UNDEFINED;
2248 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2249 pthread_attr_setaffinity_np(&attrs,
2250 MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2251 (cpu_set_t *)(thread_data[t].mask->affinity.bits));
/* On create failure, tear down the threads already started. */
2253 if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2254 for (tt = 1; tt < t; tt++)
2255 record__terminate_thread(&thread_data[t]);
2256 pr_err("Failed to start threads: %s\n", strerror(errno));
/* Wait for the worker's READY ack before starting the next one. */
2261 err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2263 pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2264 thread_msg_tags[msg]);
2266 pr_warning("threads[%d]: failed to receive start notification from %d\n",
2267 thread->tid, rec->thread_data[t].tid);
2270 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2271 (cpu_set_t *)thread->mask->affinity.bits);
2273 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2276 pthread_attr_destroy(&attrs);
2278 if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2279 pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
/*
 * Terminate all streaming worker threads, then fold their per-thread
 * counters (samples, compression byte counts) into the record/session
 * totals, logging per-thread statistics at debug level.
 */
2286 static int record__stop_threads(struct record *rec)
2289 struct record_thread *thread_data = rec->thread_data;
2291 for (t = 1; t < rec->nr_threads; t++)
2292 record__terminate_thread(&thread_data[t]);
/* Slot 0 (the main thread) is included in the accounting loop. */
2294 for (t = 0; t < rec->nr_threads; t++) {
2295 rec->samples += thread_data[t].samples;
2296 if (!record__threads_enabled(rec))
2298 rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2299 rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2300 pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2301 thread_data[t].samples, thread_data[t].waking);
2302 if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2303 pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2304 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2306 pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2312 static unsigned long record__waking(struct record *rec)
2315 unsigned long waking = 0;
2316 struct record_thread *thread_data = rec->thread_data;
2318 for (t = 0; t < rec->nr_threads; t++)
2319 waking += thread_data[t].waking;
/*
 * Main driver of 'perf record': sets up signal handlers, the session,
 * compression, clock references and header features; forks/launches the
 * workload; opens and mmaps events; writes the header; synthesizes
 * tracking events; runs the read/poll main loop (handling auxtrace
 * snapshots, --switch-output rotation, and control-fd commands); then
 * tears everything down and finalizes the output file.
 *
 * NOTE(review): this copy is heavily truncated — many error checks,
 * 'goto out_*' statements, braces and the final cleanup labels appear
 * elided.  Treat the control flow below as indicative only and verify
 * against upstream tools/perf/builtin-record.c before modifying.
 */
2324 static int __cmd_record(struct record *rec, int argc, const char **argv)
2328 const bool forks = argc > 0;
2329 struct perf_tool *tool = &rec->tool;
2330 struct record_opts *opts = &rec->opts;
2331 struct perf_data *data = &rec->data;
2332 struct perf_session *session;
2333 bool disabled = false, draining = false;
2336 enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
/* Install exit/signal handlers before anything that can fail. */
2338 atexit(record__sig_exit);
2339 signal(SIGCHLD, sig_handler);
2340 signal(SIGINT, sig_handler);
2341 signal(SIGTERM, sig_handler);
2342 signal(SIGSEGV, sigsegv_handler);
2344 if (rec->opts.record_namespaces)
2345 tool->namespace_events = true;
2347 if (rec->opts.record_cgroup) {
2348 #ifdef HAVE_FILE_HANDLE
2349 tool->cgroup_events = true;
2351 pr_err("cgroup tracking is not supported\n");
/* SIGUSR2 drives auxtrace snapshots and --switch-output rotation. */
2356 if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2357 signal(SIGUSR2, snapshot_sig_handler);
2358 if (rec->opts.auxtrace_snapshot_mode)
2359 trigger_on(&auxtrace_snapshot_trigger);
2360 if (rec->switch_output.enabled)
2361 trigger_on(&switch_output_trigger);
2363 signal(SIGUSR2, SIG_IGN);
2366 session = perf_session__new(data, tool);
2367 if (IS_ERR(session)) {
2368 pr_err("Perf session creation failed.\n");
2369 return PTR_ERR(session);
/* Parallel streaming is incompatible with pipes and AUX tracing. */
2372 if (record__threads_enabled(rec)) {
2373 if (perf_data__is_pipe(&rec->data)) {
2374 pr_err("Parallel trace streaming is not available in pipe mode.\n");
2377 if (rec->opts.full_auxtrace) {
2378 pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2383 fd = perf_data__fd(data);
2384 rec->session = session;
2386 if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2387 pr_err("Compression initialization failed.\n");
2390 #ifdef HAVE_EVENTFD_SUPPORT
/* eventfd used to wake the poll loop from signal context. */
2391 done_fd = eventfd(0, EFD_NONBLOCK);
2393 pr_err("Failed to create wakeup eventfd, error: %m\n");
2395 goto out_delete_session;
2397 err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2399 pr_err("Failed to add wakeup eventfd to poll list\n");
2401 goto out_delete_session;
2403 #endif // HAVE_EVENTFD_SUPPORT
2405 session->header.env.comp_type = PERF_COMP_ZSTD;
2406 session->header.env.comp_level = rec->opts.comp_level;
2408 if (rec->opts.kcore &&
2409 !record__kcore_readable(&session->machines.host)) {
2410 pr_err("ERROR: kcore is not readable.\n");
2414 if (record__init_clock(rec))
2417 record__init_features(rec);
/* Fork the workload now (stopped until evlist__start_workload). */
2420 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2421 workload_exec_failed_signal);
2423 pr_err("Couldn't run the workload!\n");
2425 goto out_delete_session;
2430 * If we have just single event and are sending data
2431 * through pipe, we need to force the ids allocation,
2432 * because we synthesize event name through the pipe
2433 * and need the id for that.
2435 if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2436 rec->opts.sample_id = true;
2438 record__uniquify_name(rec);
2440 /* Debug message used by test scripts */
2441 pr_debug3("perf record opening and mmapping events\n");
2442 if (record__open(rec) != 0) {
2444 goto out_free_threads;
2446 /* Debug message used by test scripts */
2447 pr_debug3("perf record done opening and mmapping events\n");
2448 session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2450 if (rec->opts.kcore) {
2451 err = record__kcore_copy(&session->machines.host, data);
2453 pr_err("ERROR: Failed to copy kcore\n");
2454 goto out_free_threads;
2458 err = bpf__apply_obj_config();
2460 char errbuf[BUFSIZ];
2462 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2463 pr_err("ERROR: Apply config to BPF failed: %s\n",
2465 goto out_free_threads;
2469 * Normally perf_session__new would do this, but it doesn't have the
2472 if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2473 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2474 rec->tool.ordered_events = false;
2477 if (!rec->evlist->core.nr_groups)
2478 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
/* Pipe output gets a pipe header; files get the real header (rewritten at end). */
2480 if (data->is_pipe) {
2481 err = perf_header__write_pipe(fd);
2483 goto out_free_threads;
2485 err = perf_session__write_header(session, rec->evlist, fd, false);
2487 goto out_free_threads;
2491 if (!rec->no_buildid
2492 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2493 pr_err("Couldn't generate buildids. "
2494 "Use --no-buildid to profile anyway.\n");
2495 goto out_free_threads;
2498 err = record__setup_sb_evlist(rec);
2500 goto out_free_threads;
2502 err = record__synthesize(rec, false);
2504 goto out_free_threads;
2506 if (rec->realtime_prio) {
2507 struct sched_param param;
2509 param.sched_priority = rec->realtime_prio;
2510 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2511 pr_err("Could not set realtime priority.\n");
2513 goto out_free_threads;
2517 if (record__start_threads(rec))
2518 goto out_free_threads;
2521 * When perf is starting the traced process, all the events
2522 * (apart from group members) have enable_on_exec=1 set,
2523 * so don't spoil it by prematurely enabling them.
2525 if (!target__none(&opts->target) && !opts->initial_delay)
2526 evlist__enable(rec->evlist);
2532 struct machine *machine = &session->machines.host;
2533 union perf_event *event;
2536 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2537 if (event == NULL) {
2543 * Some H/W events are generated before COMM event
2544 * which is emitted during exec(), so perf script
2545 * cannot see a correct process name for those events.
2546 * Synthesize COMM event to prevent it.
2548 tgid = perf_event__synthesize_comm(tool, event,
2549 rec->evlist->workload.pid,
2550 process_synthesized_event,
2557 event = malloc(sizeof(event->namespaces) +
2558 (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2559 machine->id_hdr_size);
2560 if (event == NULL) {
2566 * Synthesize NAMESPACES event for the command specified.
2568 perf_event__synthesize_namespaces(tool, event,
2569 rec->evlist->workload.pid,
2570 tgid, process_synthesized_event,
/* Release the stopped workload so it execs now that events are armed. */
2574 evlist__start_workload(rec->evlist);
2577 if (opts->initial_delay) {
2578 pr_info(EVLIST_DISABLED_MSG);
2579 if (opts->initial_delay > 0) {
2580 usleep(opts->initial_delay * USEC_PER_MSEC);
2581 evlist__enable(rec->evlist);
2582 pr_info(EVLIST_ENABLED_MSG);
2586 err = event_enable_timer__start(rec->evlist->eet);
2590 /* Debug message used by test scripts */
2591 pr_debug3("perf record has started\n");
2594 trigger_ready(&auxtrace_snapshot_trigger);
2595 trigger_ready(&switch_output_trigger);
2596 perf_hooks__invoke_record_start();
2599 * Must write FINISHED_INIT so it will be seen after all other
2600 * synthesized user events, but before any regular events.
2602 err = write_finished_init(rec, false);
/* ---- main read/poll loop ---- */
2607 unsigned long long hits = thread->samples;
2610 * rec->evlist->bkw_mmap_state is possible to be
2611 * BKW_MMAP_EMPTY here: when done == true and
2612 * hits != rec->samples in previous round.
2614 * evlist__toggle_bkw_mmap ensure we never
2615 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2617 if (trigger_is_hit(&switch_output_trigger) || done || draining)
2618 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2620 if (record__mmap_read_all(rec, false) < 0) {
2621 trigger_error(&auxtrace_snapshot_trigger);
2622 trigger_error(&switch_output_trigger);
2627 if (auxtrace_record__snapshot_started) {
2628 auxtrace_record__snapshot_started = 0;
2629 if (!trigger_is_error(&auxtrace_snapshot_trigger))
2630 record__read_auxtrace_snapshot(rec, false);
2631 if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2632 pr_err("AUX area tracing snapshot failed\n");
2638 if (trigger_is_hit(&switch_output_trigger)) {
2640 * If switch_output_trigger is hit, the data in
2641 * overwritable ring buffer should have been collected,
2642 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2644 * If SIGUSR2 raise after or during record__mmap_read_all(),
2645 * record__mmap_read_all() didn't collect data from
2646 * overwritable ring buffer. Read again.
2648 if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2650 trigger_ready(&switch_output_trigger);
2653 * Reenable events in overwrite ring buffer after
2654 * record__mmap_read_all(): we should have collected
2657 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2660 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2661 record__waking(rec));
2663 fd = record__switch_output(rec, false);
2665 pr_err("Failed to switch to new file\n");
2666 trigger_error(&switch_output_trigger);
2671 /* re-arm the alarm */
2672 if (rec->switch_output.time)
2673 alarm(rec->switch_output.time);
/* Idle round: block in poll until events, signal or control input. */
2676 if (hits == thread->samples) {
2677 if (done || draining)
2679 err = fdarray__poll(&thread->pollfd, -1);
2681 * Propagate error, only if there's any. Ignore positive
2682 * number of returned events and interrupt error.
2684 if (err > 0 || (err < 0 && errno == EINTR))
2688 if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2689 record__thread_munmap_filtered, NULL) == 0)
2692 err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
/* Process a command from the control fd (--control), if any. */
2697 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2699 case EVLIST_CTL_CMD_SNAPSHOT:
2700 hit_auxtrace_snapshot_trigger(rec);
2701 evlist__ctlfd_ack(rec->evlist);
2703 case EVLIST_CTL_CMD_STOP:
2706 case EVLIST_CTL_CMD_ACK:
2707 case EVLIST_CTL_CMD_UNSUPPORTED:
2708 case EVLIST_CTL_CMD_ENABLE:
2709 case EVLIST_CTL_CMD_DISABLE:
2710 case EVLIST_CTL_CMD_EVLIST:
2711 case EVLIST_CTL_CMD_PING:
2717 err = event_enable_timer__process(rec->evlist->eet);
2726 * When perf is starting the traced process, at the end events
2727 * die with the process and we wait for that. Thus no need to
2728 * disable events in this case.
2730 if (done && !disabled && !target__none(&opts->target)) {
2731 trigger_off(&auxtrace_snapshot_trigger);
2732 evlist__disable(rec->evlist);
/* ---- loop exit / teardown ---- */
2737 trigger_off(&auxtrace_snapshot_trigger);
2738 trigger_off(&switch_output_trigger);
2740 if (opts->auxtrace_snapshot_on_exit)
2741 record__auxtrace_snapshot_exit(rec);
2743 if (forks && workload_exec_errno) {
2744 char msg[STRERR_BUFSIZE], strevsels[2048];
2745 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2747 evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2749 pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2750 strevsels, argv[0], emsg);
2756 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2757 record__waking(rec));
2759 write_finished_init(rec, true);
2761 if (target__none(&rec->opts.target))
2762 record__synthesize_workload(rec, true);
2765 record__stop_threads(rec);
2766 record__mmap_read_all(rec, true);
2768 record__free_thread_data(rec);
2769 evlist__finalize_ctlfd(rec->evlist);
2770 record__aio_mmap_read_sync(rec);
2772 if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2773 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2774 session->header.env.comp_ratio = ratio + 0.5;
/* Reap the workload if it hasn't exited yet. */
2780 if (!child_finished)
2781 kill(rec->evlist->workload.pid, SIGTERM);
2787 else if (WIFEXITED(exit_status))
2788 status = WEXITSTATUS(exit_status);
2789 else if (WIFSIGNALED(exit_status))
2790 signr = WTERMSIG(exit_status);
2795 rec->bytes_written += off_cpu_write(rec->session);
2797 record__read_lost_samples(rec);
2798 record__synthesize(rec, true);
2799 /* this will be recalculated during process_buildids() */
2803 if (!rec->timestamp_filename) {
2804 record__finish_output(rec);
2806 fd = record__switch_output(rec, true);
2809 goto out_delete_session;
2814 perf_hooks__invoke_record_end();
/* Final user-facing summary. */
2816 if (!err && !quiet) {
2818 const char *postfix = rec->timestamp_filename ?
2819 ".<timestamp>" : "";
2821 if (rec->samples && !rec->opts.full_auxtrace)
2822 scnprintf(samples, sizeof(samples),
2823 " (%" PRIu64 " samples)", rec->samples);
2827 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
2828 perf_data__size(data) / 1024.0 / 1024.0,
2829 data->path, postfix, samples);
2831 fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2832 rec->session->bytes_transferred / 1024.0 / 1024.0,
2835 fprintf(stderr, " ]\n");
2839 #ifdef HAVE_EVENTFD_SUPPORT
2847 zstd_fini(&session->zstd_data);
2848 perf_session__delete(session);
2850 if (!opts->no_bpf_event)
2851 evlist__stop_sb_thread(rec->sb_evlist);
/*
 * Log the configured callchain collection mode at debug verbosity;
 * for DWARF mode also report the user-stack dump size.
 */
2855 static void callchain_debug(struct callchain_param *callchain)
2857 static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2859 pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2861 if (callchain->record_mode == CALLCHAIN_DWARF)
2862 pr_debug("callchain: stack dump size %d\n",
2863 callchain->dump_size);
/*
 * Parse a --call-graph argument into @callchain. @unset corresponds to
 * --no-call-graph and disables collection. When DWARF unwinding is
 * selected, data-address sampling is switched on as well, since the
 * unwinder needs sample addresses.
 * NOTE(review): some control-flow lines (braces/returns) are elided in
 * this extract; the visible logic is documented as-is.
 */
2866 int record_opts__parse_callchain(struct record_opts *record,
2867 struct callchain_param *callchain,
2868 const char *arg, bool unset)
2871 callchain->enabled = !unset;
2873 /* --no-call-graph */
2875 callchain->record_mode = CALLCHAIN_NONE;
2876 pr_debug("callchain: disabled\n");
2880 ret = parse_callchain_record_opt(arg, callchain);
2882 /* Enable data address sampling for DWARF unwind. */
2883 if (callchain->record_mode == CALLCHAIN_DWARF)
2884 record->sample_address = true;
2885 callchain_debug(callchain);
/*
 * Option callback for --call-graph: forwards to
 * record_opts__parse_callchain() using the global callchain_param.
 */
2891 int record_parse_callchain_opt(const struct option *opt,
2895 return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
/*
 * Option callback for bare -g: enable callchain collection and default
 * to frame-pointer mode if no mode was chosen yet.
 */
2898 int record_callchain_opt(const struct option *opt,
2899 const char *arg __maybe_unused,
2900 int unset __maybe_unused)
2902 struct callchain_param *callchain = opt->value;
2904 callchain->enabled = true;
2906 if (callchain->record_mode == CALLCHAIN_NONE)
2907 callchain->record_mode = CALLCHAIN_FP;
2909 callchain_debug(callchain);
/*
 * perf_config() callback: apply perfconfig keys that affect 'perf record'.
 * Handles record.build-id (cache/no-cache/skip/mmap), record.call-graph
 * (delegated to perf_default_config under the call-graph.record-mode key),
 * record.aio (when AIO support is built in) and record.debuginfod.
 * @cb is the struct record being configured.
 */
2913 static int perf_record_config(const char *var, const char *value, void *cb)
2915 struct record *rec = cb;
2917 if (!strcmp(var, "record.build-id")) {
2918 if (!strcmp(value, "cache"))
2919 rec->no_buildid_cache = false;
2920 else if (!strcmp(value, "no-cache"))
2921 rec->no_buildid_cache = true;
2922 else if (!strcmp(value, "skip"))
2923 rec->no_buildid = true;
2924 else if (!strcmp(value, "mmap"))
2925 rec->buildid_mmap = true;
2930 if (!strcmp(var, "record.call-graph")) {
2931 var = "call-graph.record-mode";
2932 return perf_default_config(var, value, cb);
2934 #ifdef HAVE_AIO_SUPPORT
2935 if (!strcmp(var, "record.aio")) {
/* strtol with base 0 accepts decimal, octal and hex spellings. */
2936 rec->opts.nr_cblocks = strtol(value, NULL, 0);
2937 if (!rec->opts.nr_cblocks)
2938 rec->opts.nr_cblocks = nr_cblocks_default;
2941 if (!strcmp(var, "record.debuginfod")) {
/* strdup() result is owned by rec->debuginfod; failure path elided here. */
2942 rec->debuginfod.urls = strdup(value);
2943 if (!rec->debuginfod.urls)
2945 rec->debuginfod.set = true;
/*
 * Option callback for -D/--delay: delegate parsing of the enable-time
 * spec (delay or time ranges) to the evlist helper.
 */
2951 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2953 struct record *rec = (struct record *)opt->value;
2955 return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
/*
 * Option callback for --affinity: select how the trace-reading thread's
 * CPU affinity follows the mmap buffer being drained ("node" or "cpu").
 */
2958 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2960 struct record_opts *opts = (struct record_opts *)opt->value;
2965 if (!strcasecmp(str, "node"))
2966 opts->affinity = PERF_AFFINITY_NODE;
2967 else if (!strcasecmp(str, "cpu"))
2968 opts->affinity = PERF_AFFINITY_CPU;
/*
 * Allocate a zeroed CPU bitmap of @nr_bits bits for @mask.
 * NOTE(review): the allocation-failure check is elided in this extract.
 */
2973 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2975 mask->nbits = nr_bits;
2976 mask->bits = bitmap_zalloc(mask->nbits);
/* Release the bitmap backing @mask (counterpart of the _alloc above). */
2983 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2985 bitmap_free(mask->bits);
/*
 * Allocate both bitmaps of a thread_mask (maps + affinity), each @nr_bits
 * wide. On failure of the second allocation the first is rolled back so
 * the mask is left in a consistent (all-NULL) state.
 */
2989 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2993 ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2995 mask->affinity.bits = NULL;
2999 ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3001 record__mmap_cpu_mask_free(&mask->maps);
3002 mask->maps.bits = NULL;
/* Free both bitmaps of a thread_mask. */
3008 static void record__thread_mask_free(struct thread_mask *mask)
3010 record__mmap_cpu_mask_free(&mask->maps);
3011 record__mmap_cpu_mask_free(&mask->affinity);
/*
 * Option callback for --threads: map the spec string onto a
 * THREAD_SPEC__* value. An empty/absent spec defaults to per-CPU
 * threading; an unrecognized tag is kept verbatim as a user spec
 * (THREAD_SPEC__USER) for later mask parsing.
 */
3014 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3017 struct record_opts *opts = opt->value;
3019 if (unset || !str || !strlen(str)) {
3020 opts->threads_spec = THREAD_SPEC__CPU;
/* Scan known tags; slot THREAD_SPEC__USER terminates the search. */
3022 for (s = 1; s < THREAD_SPEC__MAX; s++) {
3023 if (s == THREAD_SPEC__USER) {
3024 opts->threads_user_spec = strdup(str);
3025 if (!opts->threads_user_spec)
3027 opts->threads_spec = THREAD_SPEC__USER;
3030 if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3031 opts->threads_spec = s;
3037 if (opts->threads_spec == THREAD_SPEC__USER)
3038 pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3040 pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
/*
 * Option callback for --max-size: parse a size with a B/K/M/G suffix
 * into the unsigned long pointed to by opt->value.
 * parse_tag_value() returns (unsigned long)-1 on a spec that matches
 * no tag.
 */
3045 static int parse_output_max_size(const struct option *opt,
3046 const char *str, int unset)
3048 unsigned long *s = (unsigned long *)opt->value;
3049 static struct parse_tag tags_size[] = {
3050 { .tag = 'B', .mult = 1 },
3051 { .tag = 'K', .mult = 1 << 10 },
3052 { .tag = 'M', .mult = 1 << 20 },
3053 { .tag = 'G', .mult = 1 << 30 },
3063 val = parse_tag_value(str, tags_size);
3064 if (val != (unsigned long) -1) {
/*
 * Option callback for -m/--mmap-pages ("pages[,pages]"): the first
 * value sets the data mmap size, the optional second (after ',') sets
 * the AUX area tracing mmap size.
 */
3072 static int record__parse_mmap_pages(const struct option *opt,
3074 int unset __maybe_unused)
3076 struct record_opts *opts = opt->value;
3078 unsigned int mmap_pages;
3093 ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3096 opts->mmap_pages = mmap_pages;
/* p points at the ',' separator; parse what follows it. */
3104 ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3108 opts->auxtrace_mmap_pages = mmap_pages;
/* Weak no-op hook; architectures override to add leaf-frame record opts. */
3115 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
/*
 * Option callback for --control: delegate fd/fifo spec parsing to the
 * evlist control helper, filling ctl_fd / ctl_fd_ack / ctl_fd_close.
 */
3119 static int parse_control_option(const struct option *opt,
3121 int unset __maybe_unused)
3123 struct record_opts *opts = opt->value;
3125 return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
/*
 * Warn when the --switch-output size threshold is smaller than the
 * kernel wakeup buffer size: output files would then grow past the
 * requested threshold before a switch can happen.
 */
3128 static void switch_output_size_warn(struct record *rec)
3130 u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3131 struct switch_output *s = &rec->switch_output;
3135 if (s->size < wakeup_size) {
3138 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3139 pr_warning("WARNING: switch-output data size lower than "
3140 "wakeup kernel buffer size (%s) "
3141 "expect bigger perf.data sizes\n", buf);
/*
 * Configure output switching from the --switch-output[-event] options.
 * The spec may be "signal", a size (B/K/M/G) or a time (s/m/h/d)
 * threshold. Not supported together with parallel streaming
 * (--threads); implies timestamped output filenames.
 */
3145 static int switch_output_setup(struct record *rec)
3147 struct switch_output *s = &rec->switch_output;
3148 static struct parse_tag tags_size[] = {
3149 { .tag = 'B', .mult = 1 },
3150 { .tag = 'K', .mult = 1 << 10 },
3151 { .tag = 'M', .mult = 1 << 20 },
3152 { .tag = 'G', .mult = 1 << 30 },
3155 static struct parse_tag tags_time[] = {
3156 { .tag = 's', .mult = 1 },
3157 { .tag = 'm', .mult = 60 },
3158 { .tag = 'h', .mult = 60*60 },
3159 { .tag = 'd', .mult = 60*60*24 },
3165 * If we're using --switch-output-events, then we imply its
3166 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3167 * thread to its parent.
3169 if (rec->switch_output_event_set) {
3170 if (record__threads_enabled(rec)) {
3171 pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3180 if (record__threads_enabled(rec)) {
3181 pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3185 if (!strcmp(s->str, "signal")) {
3188 pr_debug("switch-output with SIGUSR2 signal\n");
3192 val = parse_tag_value(s->str, tags_size);
3193 if (val != (unsigned long) -1) {
3195 pr_debug("switch-output with %s size threshold\n", s->str);
3199 val = parse_tag_value(s->str, tags_time);
3200 if (val != (unsigned long) -1) {
3202 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
/* Switched files need distinct names, so force timestamp suffixes. */
3210 rec->timestamp_filename = true;
3213 if (s->size && !rec->opts.no_buffering)
3214 switch_output_size_warn(rec);
/* Usage strings shown by parse_options(); exported via record_usage. */
3219 static const char * const __record_usage[] = {
3220 "perf record [<options>] [<command>]",
3221 "perf record [<options>] -- <command> [<options>]",
3224 const char * const *record_usage = __record_usage;
/*
 * MMAP tool callback for build-id processing: skip kernel maps (they
 * were installed by perf_session__create_kernel_maps() already) and
 * forward user-space maps to the generic handler.
 */
3226 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3227 struct perf_sample *sample, struct machine *machine)
3230 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3231 * no need to add them twice.
3233 if (!(event->header.misc & PERF_RECORD_MISC_USER))
3235 return perf_event__process_mmap(tool, event, sample, machine);
/* MMAP2 variant of build_id__process_mmap(); same kernel-map skip. */
3238 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3239 struct perf_sample *sample, struct machine *machine)
3242 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3243 * no need to add them twice.
3245 if (!(event->header.misc & PERF_RECORD_MISC_USER))
3248 return perf_event__process_mmap2(tool, event, sample, machine);
/*
 * Tool callback used for itrace_start/aux events: fold the sample time
 * into the recorded first/last timestamp boundary.
 */
3251 static int process_timestamp_boundary(struct perf_tool *tool,
3252 union perf_event *event __maybe_unused,
3253 struct perf_sample *sample,
3254 struct machine *machine __maybe_unused)
3256 struct record *rec = container_of(tool, struct record, tool);
3258 set_timestamp_boundary(rec, sample->time);
/*
 * Option callback for --synth: translate the no|all|task|mmap|cgroup
 * spec into a PERF_SYNTH_* bitmask. The string is duplicated because
 * parse_synth_opt() tokenizes its input.
 */
3262 static int parse_record_synth_option(const struct option *opt,
3264 int unset __maybe_unused)
3266 struct record_opts *opts = opt->value;
3267 char *p = strdup(str);
3272 opts->synth = parse_synth_opt(p);
3275 if (opts->synth < 0) {
3276 pr_err("Invalid synth option: %s\n", str);
3283 * XXX Ideally would be local to cmd_record() and passed to a record__new
3284 * because we need to have access to it in record__exit, that is called
3285 * after cmd_record() exits, but since record_options need to be accessible to
3286 * builtin-script, leave it here.
3288 * At least we don't ouch it in all the other functions here directly.
3290 * Just say no to tons of global variables, sigh.
/*
 * Global record state: default option values plus the perf_tool
 * callbacks wired up for session processing (ordered delivery on).
 */
3292 static struct record record = {
3294 .sample_time = true,
3295 .mmap_pages = UINT_MAX,
3296 .user_freq = UINT_MAX,
3297 .user_interval = ULLONG_MAX,
3301 .default_per_cpu = true,
3303 .mmap_flush = MMAP_FLUSH_DEFAULT,
3304 .nr_threads_synthesize = 1,
3307 .synth = PERF_SYNTH_ALL,
3310 .sample = process_sample_event,
3311 .fork = perf_event__process_fork,
3312 .exit = perf_event__process_exit,
3313 .comm = perf_event__process_comm,
3314 .namespaces = perf_event__process_namespaces,
3315 .mmap = build_id__process_mmap,
3316 .mmap2 = build_id__process_mmap2,
3317 .itrace_start = process_timestamp_boundary,
3318 .aux = process_timestamp_boundary,
3319 .ordered_events = true,
/* Help text for --call-graph; dry_run backs the --dry-run flag. */
3323 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3324 "\n\t\t\t\tDefault: fp";
3326 static bool dry_run;
3329 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3330 * with it and switch to use the library functions in perf_evlist that came
3331 * from builtin-record.c, i.e. use record_opts,
3332 * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
/*
 * Command-line option table for 'perf record', consumed by
 * parse_options(); also exported (record_options) for builtin-script.
 */
3335 static struct option __record_options[] = {
3336 OPT_CALLBACK('e', "event", &record.evlist, "event",
3337 "event selector. use 'perf list' to list available events",
3338 parse_events_option),
3339 OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3340 "event filter", parse_filter),
3341 OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3342 NULL, "don't record events from perf itself",
3344 OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3345 "record events on existing process id"),
3346 OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3347 "record events on existing thread id"),
3348 OPT_INTEGER('r', "realtime", &record.realtime_prio,
3349 "collect data with this RT SCHED_FIFO priority"),
3350 OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3351 "collect data without buffering"),
3352 OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3353 "collect raw sample records from all opened counters"),
3354 OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3355 "system-wide collection from all CPUs"),
3356 OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3357 "list of cpus to monitor"),
3358 OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3359 OPT_STRING('o', "output", &record.data.path, "file",
3360 "output file name"),
3361 OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3362 &record.opts.no_inherit_set,
3363 "child tasks do not inherit counters"),
3364 OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3365 "synthesize non-sample events at the end of output"),
3366 OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3367 OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3368 OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3369 "Fail if the specified frequency can't be used"),
3370 OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3371 "profile at this frequency",
3372 record__parse_freq),
3373 OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3374 "number of mmap data pages and AUX area tracing mmap pages",
3375 record__parse_mmap_pages),
3376 OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3377 "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3378 record__mmap_flush_parse),
3379 OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3380 NULL, "enables call-graph recording" ,
3381 &record_callchain_opt),
3382 OPT_CALLBACK(0, "call-graph", &record.opts,
3383 "record_mode[,record_size]", record_callchain_help,
3384 &record_parse_callchain_opt),
3385 OPT_INCR('v', "verbose", &verbose,
3386 "be more verbose (show counter open errors, etc)"),
3387 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3388 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3389 "per thread counts"),
3390 OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3391 OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3392 "Record the sample physical addresses"),
3393 OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3394 "Record the sampled data address data page size"),
3395 OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3396 "Record the sampled code address (ip) page size"),
3397 OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3398 OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3399 "Record the sample identifier"),
3400 OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3401 &record.opts.sample_time_set,
3402 "Record the sample timestamps"),
3403 OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3404 "Record the sample period"),
3405 OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3407 OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3408 &record.no_buildid_cache_set,
3409 "do not update the buildid cache"),
3410 OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3411 &record.no_buildid_set,
3412 "do not collect buildids in perf.data"),
3413 OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3414 "monitor event in cgroup name only",
3416 OPT_CALLBACK('D', "delay", &record, "ms",
3417 "ms to wait before starting measurement after program start (-1: start with events disabled), "
3418 "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3419 record__parse_event_enable_time),
3420 OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3421 OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3424 OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3425 "branch any", "sample any taken branches",
3426 parse_branch_stack),
3428 OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3429 "branch filter mask", "branch stack filter modes",
3430 parse_branch_stack),
3431 OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3432 "sample by weight (on special events only)"),
3433 OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3434 "sample transaction flags (special events only)"),
3435 OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3436 "use per-thread mmaps"),
3437 OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3438 "sample selected machine registers on interrupt,"
3439 " use '-I?' to list register names", parse_intr_regs),
3440 OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3441 "sample selected machine registers on interrupt,"
3442 " use '--user-regs=?' to list register names", parse_user_regs),
3443 OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3444 "Record running/enabled time of read (:S) events"),
3445 OPT_CALLBACK('k', "clockid", &record.opts,
3446 "clockid", "clockid to use for events, see clock_gettime()",
3448 OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3449 "opts", "AUX area tracing Snapshot Mode", ""),
3450 OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3451 "opts", "sample AUX area", ""),
3452 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3453 "per thread proc mmap processing timeout in ms"),
3454 OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3455 "Record namespaces events"),
3456 OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3457 "Record cgroup events"),
3458 OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3459 &record.opts.record_switch_events_set,
3460 "Record context switch events"),
3461 OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3462 "Configure all used events to run in kernel space.",
3463 PARSE_OPT_EXCLUSIVE),
3464 OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3465 "Configure all used events to run in user space.",
3466 PARSE_OPT_EXCLUSIVE),
3467 OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3468 "collect kernel callchains"),
3469 OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3470 "collect user callchains"),
3471 OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
3472 "clang binary to use for compiling BPF scriptlets"),
3473 OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
3474 "options passed to clang when compiling BPF scriptlets"),
3475 OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3476 "file", "vmlinux pathname"),
3477 OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3478 "Record build-id of all DSOs regardless of hits"),
3479 OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3480 "Record build-id in map events"),
3481 OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3482 "append timestamp to output filename"),
3483 OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3484 "Record timestamp boundary (time of first/last samples)"),
3485 OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3486 &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3487 "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3489 OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
3490 "switch output event selector. use 'perf list' to list available events",
3491 parse_events_option_new_evlist),
3492 OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3493 "Limit number of switch output generated files"),
3494 OPT_BOOLEAN(0, "dry-run", &dry_run,
3495 "Parse options then exit"),
3496 #ifdef HAVE_AIO_SUPPORT
3497 OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3498 &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3501 OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3502 "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3503 record__parse_affinity),
3504 #ifdef HAVE_ZSTD_SUPPORT
3505 OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3506 "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3507 record__parse_comp_level),
3509 OPT_CALLBACK(0, "max-size", &record.output_max_size,
3510 "size", "Limit the maximum size of the output file", parse_output_max_size),
3511 OPT_UINTEGER(0, "num-thread-synthesize",
3512 &record.opts.nr_threads_synthesize,
3513 "number of threads to run for event synthesis"),
3515 OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3516 "libpfm4 event selector. use 'perf list' to list available events",
3517 parse_libpfm_events_option),
3519 OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3520 "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3521 "\t\t\t 'snapshot': AUX area tracing snapshot).\n"
3522 "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3523 "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3524 parse_control_option),
3525 OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3526 "Fine-tune event synthesis: default=all", parse_record_synth_option),
3527 OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3528 &record.debuginfod.set, "debuginfod urls",
3529 "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3531 OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3532 "write collected trace data into several data files using parallel threads",
3533 record__parse_threads),
3534 OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3538 struct option *record_options = __record_options;
/*
 * Set a bit in @mask for every CPU in @cpus. A dummy CPU map (e.g. from
 * per-thread monitoring) leaves the mask untouched; a CPU beyond the
 * mask width is rejected (ENODEV, per the inline note).
 */
3540 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3542 struct perf_cpu cpu;
3545 if (cpu_map__is_dummy(cpus))
3548 perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3551 /* Return ENODEV is input cpu is greater than max cpu */
3552 if ((unsigned long)cpu.cpu > mask->nbits)
3554 __set_bit(cpu.cpu, mask->bits);
/*
 * Initialize @mask from a textual CPU list (@mask_spec): build a CPU
 * map from the string, clear the bitmap, then set the listed CPUs.
 * The temporary CPU map reference is dropped before returning.
 */
3560 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3562 struct perf_cpu_map *cpus;
3564 cpus = perf_cpu_map__new(mask_spec);
3568 bitmap_zero(mask->bits, mask->nbits);
3569 if (record__mmap_cpu_mask_init(mask, cpus))
3572 perf_cpu_map__put(cpus);
/* Free the first @nr_threads thread masks and the array itself. */
3577 static void record__free_thread_masks(struct record *rec, int nr_threads)
3581 if (rec->thread_masks)
3582 for (t = 0; t < nr_threads; t++)
3583 record__thread_mask_free(&rec->thread_masks[t]);
3585 zfree(&rec->thread_masks);
/*
 * Allocate @nr_threads thread masks, each @nr_bits wide. On a partial
 * failure everything allocated so far is released again.
 */
3588 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3592 rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3593 if (!rec->thread_masks) {
3594 pr_err("Failed to allocate thread masks\n");
3598 for (t = 0; t < nr_threads; t++) {
3599 ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3601 pr_err("Failed to allocate thread masks[%d]\n", t);
/* Error path: unwind all masks allocated so far. */
3609 record__free_thread_masks(rec, nr_threads);
/*
 * THREAD_SPEC__CPU: one recording thread per monitored CPU; each
 * thread's maps and affinity masks contain exactly that CPU.
 */
3614 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3616 int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3618 ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3622 rec->nr_threads = nr_cpus;
3623 pr_debug("nr_threads: %d\n", rec->nr_threads);
3625 for (t = 0; t < rec->nr_threads; t++) {
3626 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3627 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3629 pr_debug("thread_masks[%d]: ", t);
3630 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3631 pr_debug("thread_masks[%d]: ", t);
3632 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
/*
 * Build rec->thread_masks from @nr_spec pairs of textual CPU lists
 * (@maps_spec / @affinity_spec). Each spec is intersected with the set
 * of monitored CPUs (invalid CPUs ignored, empty result rejected) and
 * must not overlap any previously accepted spec (tracked in full_mask).
 * Accepted masks are appended to a realloc()'d array; ownership of the
 * working thread_mask transfers into the array and a fresh one is
 * allocated for the next iteration. Temporaries are freed on all paths.
 */
3639 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3640 const char **maps_spec, const char **affinity_spec,
3645 struct mmap_cpu_mask cpus_mask;
3646 struct thread_mask thread_mask, full_mask, *thread_masks;
3648 ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3650 pr_err("Failed to allocate CPUs mask\n");
3654 ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3656 pr_err("Failed to init cpu mask\n");
3657 goto out_free_cpu_mask;
3660 ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3662 pr_err("Failed to allocate full mask\n");
3663 goto out_free_cpu_mask;
3666 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3668 pr_err("Failed to allocate thread mask\n");
3669 goto out_free_full_and_cpu_masks;
3672 for (s = 0; s < nr_spec; s++) {
3673 ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3675 pr_err("Failed to initialize maps thread mask\n");
3678 ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3680 pr_err("Failed to initialize affinity thread mask\n");
3684 /* ignore invalid CPUs but do not allow empty masks */
3685 if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3686 cpus_mask.bits, thread_mask.maps.nbits)) {
3687 pr_err("Empty maps mask: %s\n", maps_spec[s]);
3691 if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3692 cpus_mask.bits, thread_mask.affinity.nbits)) {
3693 pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3698 /* do not allow intersection with other masks (full_mask) */
3699 if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3700 thread_mask.maps.nbits)) {
3701 pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3705 if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3706 thread_mask.affinity.nbits)) {
3707 pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
/* Accumulate the accepted spec into the union of all masks so far. */
3712 bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3713 thread_mask.maps.bits, full_mask.maps.nbits);
3714 bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3715 thread_mask.affinity.bits, full_mask.maps.nbits);
3717 thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3718 if (!thread_masks) {
3719 pr_err("Failed to reallocate thread masks\n");
3723 rec->thread_masks = thread_masks;
/* Struct copy: the array now owns this mask's bitmaps. */
3724 rec->thread_masks[t] = thread_mask;
3726 pr_debug("thread_masks[%d]: ", t);
3727 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3728 pr_debug("thread_masks[%d]: ", t);
3729 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3732 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3734 pr_err("Failed to allocate thread mask\n");
3735 goto out_free_full_and_cpu_masks;
3738 rec->nr_threads = t;
3739 pr_debug("nr_threads: %d\n", rec->nr_threads);
3740 if (!rec->nr_threads)
3744 record__thread_mask_free(&thread_mask);
3745 out_free_full_and_cpu_masks:
3746 record__thread_mask_free(&full_mask);
3748 record__mmap_cpu_mask_free(&cpus_mask);
/*
 * THREAD_SPEC__CORE: derive per-thread masks from the core CPU lists of
 * the CPU topology (same list used for both maps and affinity).
 */
3753 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3756 struct cpu_topology *topo;
3758 topo = cpu_topology__new();
3760 pr_err("Failed to allocate CPU topology\n");
3764 ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3765 topo->core_cpus_list, topo->core_cpus_lists);
3766 cpu_topology__delete(topo);
/*
 * THREAD_SPEC__PACKAGE: like the core variant, but one recording thread
 * per physical package, using the topology's package CPU lists.
 */
3771 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3774 struct cpu_topology *topo;
3776 topo = cpu_topology__new();
3778 pr_err("Failed to allocate CPU topology\n");
3782 ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3783 topo->package_cpus_list, topo->package_cpus_lists);
3784 cpu_topology__delete(topo);
/*
 * THREAD_SPEC__NUMA: build a spec array from each NUMA node's CPU list
 * and hand it to record__init_thread_masks_spec(); the spec array only
 * borrows the strings owned by the topology object.
 */
3789 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3794 struct numa_topology *topo;
3796 topo = numa_topology__new();
3798 pr_err("Failed to allocate NUMA topology\n");
3802 spec = zalloc(topo->nr * sizeof(char *));
3804 pr_err("Failed to allocate NUMA spec\n");
3806 goto out_delete_topo;
3808 for (s = 0; s < topo->nr; s++)
3809 spec[s] = topo->nodes[s].cpus;
3811 ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3816 numa_topology__delete(topo);
/*
 * THREAD_SPEC__USER: parse the user-supplied --threads spec of the form
 * "maps_cpus/affinity_cpus:maps_cpus/affinity_cpus:...". Each ':'
 * separated entry is split on '/' into a maps mask and an affinity
 * mask; both halves are strdup()'d into growable spec arrays which are
 * freed after record__init_thread_masks_spec() consumes them.
 * NOTE(review): several error/cleanup lines are elided in this extract.
 */
3821 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3825 char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3826 char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
/* strtok_r continuation idiom: non-NULL input only on the first pass. */
3828 for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3829 spec = strtok_r(user_spec, ":", &spec_ptr);
3832 pr_debug2("threads_spec[%d]: %s\n", t, spec);
3833 mask = strtok_r(spec, "/", &mask_ptr);
3836 pr_debug2("  maps mask: %s\n", mask);
3837 tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3839 pr_err("Failed to reallocate maps spec\n");
3843 maps_spec = tmp_spec;
3844 maps_spec[nr_spec] = dup_mask = strdup(mask);
3845 if (!maps_spec[nr_spec]) {
3846 pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3850 mask = strtok_r(NULL, "/", &mask_ptr);
3852 pr_err("Invalid thread maps or affinity specs\n");
3856 pr_debug2("  affinity mask: %s\n", mask);
3857 tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3859 pr_err("Failed to reallocate affinity spec\n");
3863 affinity_spec = tmp_spec;
3864 affinity_spec[nr_spec] = strdup(mask);
3865 if (!affinity_spec[nr_spec]) {
3866 pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3874 ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3875 (const char **)affinity_spec, nr_spec);
/* Cleanup: release the duplicated spec strings and arrays. */
3879 for (s = 0; s < nr_spec; s++) {
3883 free(affinity_spec[s]);
3885 free(affinity_spec);
/*
 * Non-parallel default: a single recording thread whose maps mask
 * covers all monitored CPUs.
 */
3891 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3895 ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3899 if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3902 rec->nr_threads = 1;
/*
 * Entry point for thread-mask setup: fall back to the single-thread
 * default when parallel streaming is off, reject --per-thread (mutually
 * exclusive), otherwise dispatch on the parsed threads_spec.
 */
3907 static int record__init_thread_masks(struct record *rec)
3910 struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3912 if (!record__threads_enabled(rec))
3913 return record__init_thread_default_masks(rec, cpus);
3915 if (evlist__per_thread(rec->evlist)) {
3916 pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3920 switch (rec->opts.threads_spec) {
3921 case THREAD_SPEC__CPU:
3922 ret = record__init_thread_cpu_masks(rec, cpus);
3924 case THREAD_SPEC__CORE:
3925 ret = record__init_thread_core_masks(rec, cpus);
3927 case THREAD_SPEC__PACKAGE:
3928 ret = record__init_thread_package_masks(rec, cpus);
3930 case THREAD_SPEC__NUMA:
3931 ret = record__init_thread_numa_masks(rec, cpus);
3933 case THREAD_SPEC__USER:
3934 ret = record__init_thread_user_masks(rec, cpus);
/*
 * Entry point for 'perf record': parse options, validate mutually
 * exclusive modes, configure the event list and targets, then hand off
 * to __cmd_record() for the actual recording session.  Cleanup of the
 * evlist, auxtrace state and thread masks happens at the tail.
 * NOTE(review): this chunk is a sampled view of the function — the
 * `goto out`/`return` statements on most error paths fall between the
 * visible lines.
 */
3943 int cmd_record(int argc, const char **argv)
3946 	struct record *rec = &record;
3947 	char errbuf[BUFSIZ];
3949 	setlocale(LC_ALL, "");
/* Stub out options whose backing support was compiled out. */
3951 #ifndef HAVE_LIBBPF_SUPPORT
3952 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
3953 	set_nobuild('\0', "clang-path", true);
3954 	set_nobuild('\0', "clang-opt", true);
3958 #ifndef HAVE_BPF_PROLOGUE
3959 # if !defined (HAVE_DWARF_SUPPORT)
3960 # define REASON "NO_DWARF=1"
3961 # elif !defined (HAVE_LIBBPF_SUPPORT)
3962 # define REASON "NO_LIBBPF=1"
3964 # define REASON "this architecture doesn't support BPF prologue"
3966 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
3967 	set_nobuild('\0', "vmlinux", true);
3972 #ifndef HAVE_BPF_SKEL
3973 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3974 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
/* Default CPU affinity mode before option parsing can override it. */
3978 	rec->opts.affinity = PERF_AFFINITY_SYS;
3980 	rec->evlist = evlist__new();
3981 	if (rec->evlist == NULL)
/* Apply perfconfig file settings, then command-line options. */
3984 	err = perf_config(perf_record_config, rec);
3988 	argc = parse_options(argc, argv, record_options, record_usage,
3989 			    PARSE_OPT_STOP_AT_NON_OPTION);
3991 	perf_quiet_option();
3993 	err = symbol__validate_sym_arguments();
3997 	perf_debuginfod_setup(&record.debuginfod);
3999 	/* Make system wide (-a) the default target. */
4000 	if (!argc && target__none(&rec->opts.target))
4001 		rec->opts.target.system_wide = true;
4003 	if (nr_cgroups && !rec->opts.target.system_wide) {
4004 		usage_with_options_msg(record_usage, record_options,
4005 			"cgroup monitoring only available in system-wide mode");
/* --buildid-mmap: carry build ids in mmap2 events instead of the cache. */
4009 	if (rec->buildid_mmap) {
4010 		if (!perf_can_record_build_id()) {
4011 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4015 		pr_debug("Enabling build id in mmap2 events.\n");
4016 		/* Enable mmap build id synthesizing. */
4017 		symbol_conf.buildid_mmap2 = true;
4018 		/* Enable perf_event_attr::build_id bit. */
4019 		rec->opts.build_id = true;
4020 		/* Disable build id cache. */
4021 		rec->no_buildid = true;
4024 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4025 		pr_err("Kernel has no cgroup sampling support.\n");
/* --kcore implies text poke events; both require a directory output. */
4030 	if (rec->opts.kcore)
4031 		rec->opts.text_poke = true;
4033 	if (rec->opts.kcore || record__threads_enabled(rec))
4034 		rec->data.is_dir = true;
/* Parallel streaming excludes --affinity and --aio. */
4036 	if (record__threads_enabled(rec)) {
4037 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4038 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4041 		if (record__aio_enabled(rec)) {
4042 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4047 	if (rec->opts.comp_level != 0) {
4048 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4049 		rec->no_buildid = true;
4052 	if (rec->opts.record_switch_events &&
4053 	    !perf_can_record_switch_events()) {
4054 		ui__error("kernel does not support recording context switch events\n");
4055 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4060 	if (switch_output_setup(rec)) {
4061 		parse_options_usage(record_usage, record_options, "switch-output", 0);
/* Time-based output switching is driven by SIGALRM. */
4066 	if (rec->switch_output.time) {
4067 		signal(SIGALRM, alarm_sig_handler);
4068 		alarm(rec->switch_output.time);
4071 	if (rec->switch_output.num_files) {
4072 		rec->switch_output.filenames = calloc(sizeof(char *),
4073 						      rec->switch_output.num_files);
4074 		if (!rec->switch_output.filenames) {
4080 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4081 		rec->timestamp_filename = false;
4082 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4086 	 * Allow aliases to facilitate the lookup of symbols for address
4087 	 * filters. Refer to auxtrace_parse_filters().
4089 	symbol_conf.allow_aliases = true;
4093 	err = record__auxtrace_init(rec);
4100 	err = bpf__setup_stdout(rec->evlist);
4102 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
4103 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
/* Decide whether the build-id cache stays enabled for this session. */
4110 	if (rec->no_buildid_cache || rec->no_buildid) {
4111 		disable_buildid_cache();
4112 	} else if (rec->switch_output.enabled) {
4114 		 * In 'perf record --switch-output', disable buildid
4115 		 * generation by default to reduce data file switching
4116 		 * overhead. Still generate buildid if they are required
4119 		 *  perf record --switch-output --no-no-buildid \
4120 		 *              --no-no-buildid-cache
4122 		 * Following code equals to:
4124 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4125 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4126 		 *         disable_buildid_cache();
4128 		bool disable = true;
4130 		if (rec->no_buildid_set && !rec->no_buildid)
4132 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4135 			rec->no_buildid = true;
4136 			rec->no_buildid_cache = true;
4137 			disable_buildid_cache();
4141 	if (record.opts.overwrite)
4142 		record.opts.tail_synthesize = true;
/* No events given on the command line: add the default event(s). */
4144 	if (rec->evlist->core.nr_entries == 0) {
4145 		if (perf_pmu__has_hybrid()) {
4146 			err = evlist__add_default_hybrid(rec->evlist,
4147 							 !record.opts.no_samples);
4149 			err = __evlist__add_default(rec->evlist,
4150 						    !record.opts.no_samples);
4154 			pr_err("Not enough memory for event selector list\n");
4159 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4160 		rec->opts.no_inherit = true;
4162 	err = target__validate(&rec->opts.target);
4164 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4165 		ui__warning("%s\n", errbuf);
4168 	err = target__parse_uid(&rec->opts.target);
4170 		int saved_errno = errno;
4172 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4173 		ui__error("%s", errbuf);
4179 	/* Enable ignoring missing threads when -u/-p option is defined. */
4180 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4182 	if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
4183 		pr_err("failed to use cpu list %s\n",
4184 		       rec->opts.target.cpu_list);
4188 	rec->opts.target.hybrid = perf_pmu__has_hybrid();
4190 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4191 		arch__add_leaf_frame_record_opts(&rec->opts);
4194 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4195 		if (rec->opts.target.pid != NULL) {
4196 			pr_err("Couldn't create thread/CPU maps: %s\n",
4197 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4201 			usage_with_options(record_usage, record_options);
4204 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4209 	 * We take all buildids when the file contains
4210 	 * AUX area tracing data because we do not decode the
4211 	 * trace because it would take too long.
4213 	if (rec->opts.full_auxtrace)
4214 		rec->buildid_all = true;
4216 	if (rec->opts.text_poke) {
4217 		err = record__config_text_poke(rec->evlist);
4219 			pr_err("record__config_text_poke failed, error %d\n", err);
4225 		err = record__config_off_cpu(rec);
4227 			pr_err("record__config_off_cpu failed, error %d\n", err);
4232 	if (record_opts__config(&rec->opts)) {
4237 	err = record__init_thread_masks(rec);
4239 		pr_err("Failed to initialize parallel data streaming masks\n");
/* Clamp async-I/O block count and compression level to their maxima. */
4243 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4244 		rec->opts.nr_cblocks = nr_cblocks_max;
4245 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4247 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4248 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4250 	if (rec->opts.comp_level > comp_level_max)
4251 		rec->opts.comp_level = comp_level_max;
4252 	pr_debug("comp level: %d\n", rec->opts.comp_level);
/* Run the recording session, then release everything acquired above. */
4254 	err = __cmd_record(&record, argc, argv);
4256 	evlist__delete(rec->evlist);
4258 	auxtrace_record__free(rec->itr);
4260 	record__free_thread_masks(rec, rec->nr_threads);
4261 	rec->nr_threads = 0;
4262 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
/*
 * Signal handler for snapshot requests: arm the auxtrace snapshot
 * trigger and, if output switching is signal-driven, the switch-output
 * trigger as well.
 */
4266 static void snapshot_sig_handler(int sig __maybe_unused)
4268 	struct record *rec = &record;
4270 	hit_auxtrace_snapshot_trigger(rec);
4272 	if (switch_output_signal(rec))
4273 		trigger_hit(&switch_output_trigger);
/*
 * SIGALRM handler for time-based output switching (--switch-output with
 * a time interval; the alarm is armed in cmd_record()): fire the
 * switch-output trigger when time-driven switching is configured.
 */
4276 static void alarm_sig_handler(int sig __maybe_unused)
4278 	struct record *rec = &record;
4280 	if (switch_output_time(rec))
4281 		trigger_hit(&switch_output_trigger);