1 // SPDX-License-Identifier: GPL-2.0
5 * Builtin record command: Record the profile of a workload
6 * (or a CPU, or a PID) into the perf.data output file - for
7 * later analysis via perf report.
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/llvm-utils.h"
41 #include "util/bpf-loader.h"
42 #include "util/trigger.h"
43 #include "util/perf-hooks.h"
44 #include "util/cpu-set-sched.h"
45 #include "util/synthetic-events.h"
46 #include "util/time-utils.h"
47 #include "util/units.h"
48 #include "util/bpf-event.h"
49 #include "util/util.h"
51 #include "util/clockid.h"
52 #include "util/pmu-hybrid.h"
53 #include "util/evlist-hybrid.h"
54 #include "util/off_cpu.h"
70 #ifdef HAVE_EVENTFD_SUPPORT
71 #include <sys/eventfd.h>
75 #include <sys/types.h>
78 #include <linux/err.h>
79 #include <linux/string.h>
80 #include <linux/time64.h>
81 #include <linux/zalloc.h>
82 #include <linux/bitmap.h>
/*
 * NOTE(review): this region is a sampled excerpt - struct bodies, closing
 * braces and some members are not visible here, so only comments were added.
 */
/* Controls output-file switching (-S/--switch-output): by signal, size or time. */
85 struct switch_output {
/* Presumably members of the per-thread CPU mask struct (thread_mask) - TODO confirm. */
98 struct mmap_cpu_mask maps;
99 struct mmap_cpu_mask affinity;
/* Per streaming-thread state: masks, pollfd set, mmaps and byte counters. */
102 struct record_thread {
104 struct thread_mask *mask;
109 struct fdarray pollfd;
113 struct mmap **overwrite_maps;
115 unsigned long long samples;
116 unsigned long waking;
/* Bytes moved by this thread; transferred = raw, compressed = post-zstd. */
118 u64 bytes_transferred;
119 u64 bytes_compressed;
/* TLS pointer to the current thread's record_thread (used by record__write() etc.). */
122 static __thread struct record_thread *thread;
/* Inter-thread message ids exchanged over the msg/ack pipes. */
125 THREAD_MSG__UNDEFINED = 0,
130 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
/* --threads specification modes (cpu/core/package/numa/user). */
135 THREAD_SPEC__UNDEFINED = 0,
138 THREAD_SPEC__PACKAGE,
144 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
145 "undefined", "cpu", "core", "package", "numa", "user"
/* Pairs an evlist pollfd slot with the corresponding per-thread pollfd slot. */
148 struct pollfd_index_map {
149 int evlist_pollfd_index;
150 int thread_pollfd_index;
/* Main state of the record session (presumably 'struct record' - header not visible). */
154 struct perf_tool tool;
155 struct record_opts opts;
157 struct perf_data data;
158 struct auxtrace_record *itr;
159 struct evlist *evlist;
160 struct perf_session *session;
161 struct evlist *sb_evlist;
164 bool switch_output_event_set;
167 bool no_buildid_cache;
168 bool no_buildid_cache_set;
171 bool timestamp_filename;
172 bool timestamp_boundary;
174 struct switch_output switch_output;
175 unsigned long long samples;
176 unsigned long output_max_size; /* = 0: unlimited */
177 struct perf_debuginfod debuginfod;
/* Parallel-streaming bookkeeping: one mask/record_thread per worker. */
179 struct thread_mask *thread_masks;
180 struct record_thread *thread_data;
181 struct pollfd_index_map *index_map;
183 size_t index_map_cnt;
/* Set by signal handlers to terminate the main record loop. */
186 static volatile int done;
188 static volatile int auxtrace_record__snapshot_started;
189 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
190 static DEFINE_TRIGGER(switch_output_trigger);
/* Human-readable names indexed by enum perf_affinity. */
192 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
/* Return the calling thread's kernel tid via the raw syscall (no glibc wrapper assumed). */
197 static inline pid_t gettid(void)
199 return (pid_t)syscall(__NR_gettid);
/* Non-zero when parallel per-thread trace streaming (--threads) was requested. */
203 static int record__threads_enabled(struct record *rec)
205 return rec->opts.threads_spec;
/* True when signal-driven output switching is configured and the trigger is armed. */
208 static bool switch_output_signal(struct record *rec)
210 return rec->switch_output.signal &&
211 trigger_is_ready(&switch_output_trigger);
/* True when size-based switching is due: threshold set, trigger armed, and
 * the main-thread byte counter has reached switch_output.size. */
214 static bool switch_output_size(struct record *rec)
216 return rec->switch_output.size &&
217 trigger_is_ready(&switch_output_trigger) &&
218 (rec->bytes_written >= rec->switch_output.size);
/* True when time-based output switching is configured and the trigger is armed. */
221 static bool switch_output_time(struct record *rec)
223 return rec->switch_output.time &&
224 trigger_is_ready(&switch_output_trigger);
/* Total bytes written so far: the main counter plus every worker thread's counter. */
227 static u64 record__bytes_written(struct record *rec)
230 u64 bytes_written = rec->bytes_written;
231 struct record_thread *thread_data = rec->thread_data;
233 for (t = 0; t < rec->nr_threads; t++)
234 bytes_written += thread_data[t].bytes_written;
236 return bytes_written;
/* True when --max-size was given (non-zero) and the limit has been reached. */
239 static bool record__output_max_size_exceeded(struct record *rec)
241 return rec->output_max_size &&
242 (record__bytes_written(rec) >= rec->output_max_size);
/*
 * Write @size bytes at @bf to the output. When @map has its own per-thread
 * file (parallel streaming) the thread's counter is bumped, otherwise the
 * session-wide rec->bytes_written is. Also enforces --max-size and fires the
 * switch-output trigger when the size threshold is crossed.
 * NOTE(review): error-return and file-selection lines are missing from this
 * sampled view - confirm against the full source.
 */
245 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
246 void *bf, size_t size)
248 struct perf_data_file *file = &rec->session->data->file;
/* Presumably redirects 'file' to map->file for per-thread output - lines hidden. */
250 if (map && map->file)
253 if (perf_data_file__write(file, bf, size) < 0) {
254 pr_err("failed to write perf data, error: %m\n");
/* Account the write against the right counter (thread vs. session). */
258 if (map && map->file)
259 thread->bytes_written += size;
261 rec->bytes_written += size;
/* --max-size reached: announce once (while !done) and stop the session. */
263 if (record__output_max_size_exceeded(rec) && !done) {
264 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
265 " stopping session ]\n",
266 record__bytes_written(rec) >> 10);
270 if (switch_output_size(rec))
271 trigger_hit(&switch_output_trigger);
/* Forward declarations for helpers defined further down / under #ifdefs. */
276 static int record__aio_enabled(struct record *rec);
277 static int record__comp_enabled(struct record *rec);
278 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
279 void *dst, size_t dst_size, void *src, size_t src_size);
281 #ifdef HAVE_AIO_SUPPORT
/*
 * Queue an asynchronous write of @size bytes from @buf at file offset @off
 * on @trace_fd, with no completion signal (polled via aio_error() instead).
 * On a hard failure (errno != EAGAIN) the cblock is marked free (fildes = -1).
 */
282 static int record__aio_write(struct aiocb *cblock, int trace_fd,
283 void *buf, size_t size, off_t off)
287 cblock->aio_fildes = trace_fd;
288 cblock->aio_buf = buf;
289 cblock->aio_nbytes = size;
290 cblock->aio_offset = off;
291 cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
294 rc = aio_write(cblock);
297 } else if (errno != EAGAIN) {
298 cblock->aio_fildes = -1;
299 pr_err("failed to queue perf data, error: %m\n");
/*
 * Check one in-flight aio request: if still EINPROGRESS do nothing; on a
 * short write restart the request for the remainder; on full completion
 * release the mmap reference taken when the request was queued.
 */
307 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
313 ssize_t aio_ret, written;
315 aio_errno = aio_error(cblock);
316 if (aio_errno == EINPROGRESS)
319 written = aio_ret = aio_return(cblock);
321 if (aio_errno != EINTR)
322 pr_err("failed to write perf data, error: %m\n");
326 rem_size = cblock->aio_nbytes - written;
/* Fully written (or failed): free the cblock slot and drop the map ref. */
329 cblock->aio_fildes = -1;
331 * md->refcount is incremented in record__aio_pushfn() for
332 * every aio write request started in record__aio_push() so
333 * decrement it because the request is now complete.
335 perf_mmap__put(&md->core);
339 * aio write request may require restart with the
340 * reminder if the kernel didn't write whole
/* Short write: re-queue the unwritten tail at the adjusted offset. */
343 rem_off = cblock->aio_offset + written;
344 rem_buf = (void *)(cblock->aio_buf + written);
345 record__aio_write(cblock, cblock->aio_fildes,
346 rem_buf, rem_size, rem_off);
/*
 * Wait for aio buffer availability on @md. With sync_all, waits for every
 * in-flight request; otherwise returns the index of the first free cblock,
 * suspending in 1ms slices until one completes.
 */
353 static int record__aio_sync(struct mmap *md, bool sync_all)
355 struct aiocb **aiocb = md->aio.aiocb;
356 struct aiocb *cblocks = md->aio.cblocks;
357 struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
362 for (i = 0; i < md->aio.nr_cblocks; ++i) {
363 if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
370 * Started aio write is not complete yet
371 * so it has to be waited before the
374 aiocb[i] = &cblocks[i];
/* Block until at least one outstanding request finishes; EAGAIN/EINTR are retried. */
381 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
382 if (!(errno == EAGAIN || errno == EINTR))
383 pr_err("failed to sync perf data, error: %m\n");
/*
 * perf_mmap__push() callback: copy (or zstd-compress) the chunk @buf/@size
 * out of the kernel ring buffer into the per-map aio staging buffer, taking
 * a map reference that is dropped when the aio write completes.
 */
394 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
396 struct record_aio *aio = to;
399 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
400 * to release space in the kernel buffer as fast as possible, calling
401 * perf_mmap__consume() from perf_mmap__push() function.
403 * That lets the kernel to proceed with storing more profiling data into
404 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
406 * Coping can be done in two steps in case the chunk of profiling data
407 * crosses the upper bound of the kernel buffer. In this case we first move
408 * part of data from map->start till the upper bound and then the reminder
409 * from the beginning of the kernel buffer till the end of the data chunk.
412 if (record__comp_enabled(aio->rec)) {
413 size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
414 mmap__mmap_len(map) - aio->size,
417 memcpy(aio->data + aio->size, buf, size);
422 * Increment map->refcount to guard map->aio.data[] buffer
423 * from premature deallocation because map object can be
424 * released earlier than aio write request started on
425 * map->aio.data[] buffer is complete.
427 * perf_mmap__put() is done at record__aio_complete()
428 * after started aio request completion or at record__aio_push()
429 * if the request failed to start.
431 perf_mmap__get(&map->core);
/*
 * Drain @map via aio: wait for a free staging buffer, stage the data with
 * record__aio_pushfn(), then queue the async write at offset *off.
 */
439 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
442 int trace_fd = rec->session->data->file.fd;
443 struct record_aio aio = { .rec = rec, .size = 0 };
446 * Call record__aio_sync() to wait till map->aio.data[] buffer
447 * becomes available after previous aio write operation.
450 idx = record__aio_sync(map, false);
451 aio.data = map->aio.data[idx];
452 ret = perf_mmap__push(map, &aio, record__aio_pushfn);
453 if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
457 ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
/* Write queued: account the bytes now and arm output switching if due. */
460 rec->bytes_written += aio.size;
461 if (switch_output_size(rec))
462 trigger_hit(&switch_output_trigger);
465 * Decrement map->refcount incremented in record__aio_pushfn()
466 * back if record__aio_write() operation failed to start, otherwise
467 * map->refcount is decremented in record__aio_complete() after
468 * aio write operation finishes successfully.
470 perf_mmap__put(&map->core);
/* Current output file position (aio writes use explicit offsets). */
476 static off_t record__aio_get_pos(int trace_fd)
478 return lseek(trace_fd, 0, SEEK_CUR);
/* Reposition the output file after a batch of offset-based aio writes. */
481 static void record__aio_set_pos(int trace_fd, off_t pos)
483 lseek(trace_fd, pos, SEEK_SET);
/* Wait for all outstanding aio writes on every mmap (no-op when aio is off). */
486 static void record__aio_mmap_read_sync(struct record *rec)
489 struct evlist *evlist = rec->evlist;
490 struct mmap *maps = evlist->mmap;
492 if (!record__aio_enabled(rec))
495 for (i = 0; i < evlist->core.nr_mmaps; i++) {
496 struct mmap *map = &maps[i];
499 record__aio_sync(map, true);
/* Default/ceiling for the number of in-flight aio control blocks per mmap. */
503 static int nr_cblocks_default = 1;
504 static int nr_cblocks_max = 4;
/*
 * Option callback for --aio[=n]: 0 disables, a bare --aio (or a value that
 * strtol() parses as 0) falls back to nr_cblocks_default.
 */
506 static int record__aio_parse(const struct option *opt,
510 struct record_opts *opts = (struct record_opts *)opt->value;
513 opts->nr_cblocks = 0;
516 opts->nr_cblocks = strtol(str, NULL, 0);
517 if (!opts->nr_cblocks)
518 opts->nr_cblocks = nr_cblocks_default;
523 #else /* HAVE_AIO_SUPPORT */
/* Without aio support: no control blocks, and the helpers below are no-op stubs. */
524 static int nr_cblocks_max = 0;
526 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
527 off_t *off __maybe_unused)
532 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
537 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
541 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
/* Common to both builds: aio is enabled when a positive cblock count was set. */
546 static int record__aio_enabled(struct record *rec)
548 return rec->opts.nr_cblocks > 0;
551 #define MMAP_FLUSH_DEFAULT 1
/*
 * Option callback for --mmap-flush: accepts B/K/M/G suffixed sizes (via
 * parse_tag_value) or a plain number, defaults to 1 byte, and clamps the
 * result to the mmap buffer size so a flush can always be satisfied.
 */
552 static int record__mmap_flush_parse(const struct option *opt,
557 struct record_opts *opts = (struct record_opts *)opt->value;
558 static struct parse_tag tags[] = {
559 { .tag = 'B', .mult = 1 },
560 { .tag = 'K', .mult = 1 << 10 },
561 { .tag = 'M', .mult = 1 << 20 },
562 { .tag = 'G', .mult = 1 << 30 },
570 opts->mmap_flush = parse_tag_value(str, tags);
571 if (opts->mmap_flush == (int)-1)
572 opts->mmap_flush = strtol(str, NULL, 0);
575 if (!opts->mmap_flush)
576 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
/* Never flush in chunks larger than one ring buffer. */
578 flush_max = evlist__mmap_size(opts->mmap_pages);
580 if (opts->mmap_flush > flush_max)
581 opts->mmap_flush = flush_max;
586 #ifdef HAVE_ZSTD_SUPPORT
587 static unsigned int comp_level_default = 1;
/*
 * Option callback for -z/--compression-level: 0 (or unparsable input)
 * selects the default level; the hard maximum is comp_level_max below.
 */
589 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
591 struct record_opts *opts = opt->value;
594 opts->comp_level = 0;
597 opts->comp_level = strtol(str, NULL, 0);
598 if (!opts->comp_level)
599 opts->comp_level = comp_level_default;
605 static unsigned int comp_level_max = 22;
/* Compression is active whenever a positive zstd level was configured. */
607 static int record__comp_enabled(struct record *rec)
609 return rec->opts.comp_level > 0;
/* perf_tool callback: write one synthesized event straight to the output file. */
612 static int process_synthesized_event(struct perf_tool *tool,
613 union perf_event *event,
614 struct perf_sample *sample __maybe_unused,
615 struct machine *machine __maybe_unused)
617 struct record *rec = container_of(tool, struct record, tool);
618 return record__write(rec, NULL, event, event->header.size);
/* Serializes concurrent synthesis threads writing through the callback above. */
621 static struct mutex synth_lock;
/* Thread-safe variant used when event synthesis runs on multiple threads. */
623 static int process_locked_synthesized_event(struct perf_tool *tool,
624 union perf_event *event,
625 struct perf_sample *sample __maybe_unused,
626 struct machine *machine __maybe_unused)
630 mutex_lock(&synth_lock);
631 ret = process_synthesized_event(tool, event, sample, machine);
632 mutex_unlock(&synth_lock);
/*
 * perf_mmap__push() callback for the non-aio path: optionally zstd-compress
 * the chunk into map->data, then write it out via record__write().
 */
636 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
638 struct record *rec = to;
640 if (record__comp_enabled(rec)) {
641 size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
646 return record__write(rec, map, bf, size);
/* Signal number that terminated us (-1 = none) and child-exit flag. */
649 static volatile int signr = -1;
650 static volatile int child_finished;
651 #ifdef HAVE_EVENTFD_SUPPORT
/* eventfd used to wake the poll() loop from the signal handler (-1 = unused). */
652 static volatile int done_fd = -1;
/* Async-signal handler: records the signal and (with eventfd) wakes poll(). */
655 static void sig_handler(int sig)
663 #ifdef HAVE_EVENTFD_SUPPORT
/* Preserve errno: write() below may clobber it inside the handler. */
666 int orig_errno = errno;
669 * It is possible for this signal handler to run after done is
670 * checked in the main loop, but before the perf counter fds are
671 * polled. If this happens, the poll() will continue to wait
672 * even though done is set, and will only break out if either
673 * another signal is received, or the counters are ready for
674 * read. To ensure the poll() doesn't sleep when done is set,
675 * use an eventfd (done_fd) to wake up the poll().
677 if (write(done_fd, &tmp, sizeof(tmp)) < 0)
678 pr_err("failed to signal wakeup fd, error: %m\n");
682 #endif // HAVE_EVENTFD_SUPPORT
/* SIGSEGV handler: run perf-hook recovery, then dump a stack trace. */
685 static void sigsegv_handler(int sig)
687 perf_hooks__recover();
688 sighandler_dump_stack(sig);
/* atexit handler: re-raise the recorded signal with default disposition. */
691 static void record__sig_exit(void)
696 signal(signr, SIG_DFL);
700 #ifdef HAVE_AUXTRACE_SUPPORT
/*
 * Write one AUX area trace event plus its (possibly split) payload to the
 * output, 8-byte aligning the payload with zero padding. For single-file,
 * non-pipe output the event's file offset is recorded in the auxtrace index.
 */
702 static int record__process_auxtrace(struct perf_tool *tool,
704 union perf_event *event, void *data1,
705 size_t len1, void *data2, size_t len2)
707 struct record *rec = container_of(tool, struct record, tool);
708 struct perf_data *data = &rec->data;
712 if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
714 int fd = perf_data__fd(data);
717 file_offset = lseek(fd, 0, SEEK_CUR);
718 if (file_offset == -1)
720 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
726 /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
727 padding = (len1 + len2) & 7;
729 padding = 8 - padding;
731 record__write(rec, map, event, event->header.size);
732 record__write(rec, map, data1, len1);
734 record__write(rec, map, data2, len2);
735 record__write(rec, map, &pad, padding);
/* Drain one auxtrace mmap in normal (non-snapshot) mode. */
740 static int record__auxtrace_mmap_read(struct record *rec,
745 ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
746 record__process_auxtrace);
/* Drain one auxtrace mmap in snapshot mode, bounded by the snapshot size. */
756 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
761 ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
762 record__process_auxtrace,
763 rec->opts.auxtrace_snapshot_size);
/* Take a snapshot from every mmap that has an auxtrace area; skip the rest. */
773 static int record__auxtrace_read_snapshot_all(struct record *rec)
778 for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
779 struct mmap *map = &rec->evlist->mmap[i];
781 if (!map->auxtrace_mmap.base)
784 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
/*
 * Read a full AUX snapshot and finish it in the PMU driver; any failure
 * poisons the snapshot trigger, success re-arms it.
 */
793 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
795 pr_debug("Recording AUX area tracing snapshot\n");
796 if (record__auxtrace_read_snapshot_all(rec) < 0) {
797 trigger_error(&auxtrace_snapshot_trigger);
799 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
800 trigger_error(&auxtrace_snapshot_trigger);
802 trigger_ready(&auxtrace_snapshot_trigger);
/* Take a final snapshot at session exit, starting one first if none is pending. */
806 static int record__auxtrace_snapshot_exit(struct record *rec)
808 if (trigger_is_error(&auxtrace_snapshot_trigger))
811 if (!auxtrace_record__snapshot_started &&
812 auxtrace_record__snapshot_start(rec->itr))
815 record__read_auxtrace_snapshot(rec, true);
816 if (trigger_is_error(&auxtrace_snapshot_trigger))
/*
 * Initialize AUX area tracing: rejects snapshot/sample modes combined with
 * parallel streaming, probes the PMU, then parses the snapshot/sample
 * options and auxtrace filters.
 */
822 static int record__auxtrace_init(struct record *rec)
826 if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
827 && record__threads_enabled(rec)) {
828 pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
833 rec->itr = auxtrace_record__init(rec->evlist, &err);
838 err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
839 rec->opts.auxtrace_snapshot_opts);
843 err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
844 rec->opts.auxtrace_sample_opts);
848 auxtrace_regroup_aux_output(rec->evlist);
850 return auxtrace_parse_filters(rec->evlist);
/* Build without HAVE_AUXTRACE_SUPPORT: all auxtrace hooks become no-op stubs. */
856 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
857 struct mmap *map __maybe_unused)
863 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
864 bool on_exit __maybe_unused)
869 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
875 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
880 static int record__auxtrace_init(struct record *rec __maybe_unused)
/*
 * Ensure a text-poke tracking event exists: if no evsel already has
 * attr.text_poke set, add a system-wide dummy event with text_poke +
 * ksymbol enabled, started immediately and timestamped.
 */
887 static int record__config_text_poke(struct evlist *evlist)
891 /* Nothing to do if text poke is already configured */
892 evlist__for_each_entry(evlist, evsel) {
893 if (evsel->core.attr.text_poke)
897 evsel = evlist__add_dummy_on_all_cpus(evlist);
901 evsel->core.attr.text_poke = 1;
902 evsel->core.attr.ksymbol = 1;
/* immediate: enable at open time rather than on exec. */
903 evsel->immediate = true;
904 evsel__set_sample_bit(evsel, TIME);
/* Set up BPF-based off-CPU profiling for the requested target. */
909 static int record__config_off_cpu(struct record *rec)
911 return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
/* Probe whether <root_dir>/proc/kcore can be opened read-only. */
914 static bool record__kcore_readable(struct machine *machine)
916 char kcore[PATH_MAX];
919 scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
921 fd = open(kcore, O_RDONLY);
/*
 * Copy the kernel image (kcore) into a kcore_dir inside the perf.data
 * directory so later analysis does not depend on the live kernel.
 */
930 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
932 char from_dir[PATH_MAX];
933 char kcore_dir[PATH_MAX];
936 snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
938 ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
942 return kcore_copy(from_dir, kcore_dir);
/* Mark both ends of the msg and ack control pipes as unopened (-1). */
945 static void record__thread_data_init_pipes(struct record_thread *thread_data)
947 thread_data->pipes.msg[0] = -1;
948 thread_data->pipes.msg[1] = -1;
949 thread_data->pipes.ack[0] = -1;
950 thread_data->pipes.ack[1] = -1;
/*
 * Create the msg/ack pipe pair used to talk to a streaming thread; on
 * failure of the second pipe the first is closed and reset so no fd leaks.
 */
953 static int record__thread_data_open_pipes(struct record_thread *thread_data)
955 if (pipe(thread_data->pipes.msg))
958 if (pipe(thread_data->pipes.ack)) {
959 close(thread_data->pipes.msg[0]);
960 thread_data->pipes.msg[0] = -1;
961 close(thread_data->pipes.msg[1]);
962 thread_data->pipes.msg[1] = -1;
966 pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
967 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
968 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
/* Close whichever pipe ends are open, resetting each fd to -1 (idempotent). */
973 static void record__thread_data_close_pipes(struct record_thread *thread_data)
975 if (thread_data->pipes.msg[0] != -1) {
976 close(thread_data->pipes.msg[0]);
977 thread_data->pipes.msg[0] = -1;
979 if (thread_data->pipes.msg[1] != -1) {
980 close(thread_data->pipes.msg[1]);
981 thread_data->pipes.msg[1] = -1;
983 if (thread_data->pipes.ack[0] != -1) {
984 close(thread_data->pipes.ack[0]);
985 thread_data->pipes.ack[0] = -1;
987 if (thread_data->pipes.ack[1] != -1) {
988 close(thread_data->pipes.ack[1]);
989 thread_data->pipes.ack[1] = -1;
/* Per-thread recording mode is signalled by a dummy user-requested CPU map. */
993 static bool evlist__per_thread(struct evlist *evlist)
995 return cpu_map__is_dummy(evlist->core.user_requested_cpus);
/*
 * Assign to @thread_data the subset of the evlist's mmaps (and overwrite
 * mmaps) whose CPUs fall in this thread's maps mask; in per-thread mode the
 * thread simply takes all nr_mmaps maps. Allocates the maps/overwrite_maps
 * pointer arrays; frees maps again if the overwrite array allocation fails.
 */
998 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1000 int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1001 struct mmap *mmap = evlist->mmap;
1002 struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1003 struct perf_cpu_map *cpus = evlist->core.all_cpus;
1004 bool per_thread = evlist__per_thread(evlist);
1007 thread_data->nr_mmaps = nr_mmaps;
/* CPU mode: this thread owns one mmap per bit set in its maps mask. */
1009 thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1010 thread_data->mask->maps.nbits);
1012 thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1013 if (!thread_data->maps)
1016 if (overwrite_mmap) {
1017 thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1018 if (!thread_data->overwrite_maps) {
1019 zfree(&thread_data->maps);
1023 pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1024 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
/* Walk all evlist mmaps, claiming those whose CPU is in this thread's mask. */
1026 for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1028 test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1029 if (thread_data->maps) {
1030 thread_data->maps[tm] = &mmap[m];
1031 pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1032 thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1034 if (thread_data->overwrite_maps) {
1035 thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1036 pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1037 thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
/*
 * Build this thread's private pollfd set by duplicating, from the evlist's
 * pollfd array, every entry whose priv pointer refers to one of the mmaps
 * (normal or overwrite) this thread was assigned.
 */
1046 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1049 struct mmap *map, *overwrite_map;
1051 fdarray__init(&thread_data->pollfd, 64);
1053 for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1054 map = thread_data->maps ? thread_data->maps[tm] : NULL;
1055 overwrite_map = thread_data->overwrite_maps ?
1056 thread_data->overwrite_maps[tm] : NULL;
1058 for (f = 0; f < evlist->core.pollfd.nr; f++) {
1059 void *ptr = evlist->core.pollfd.priv[f].ptr;
1061 if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1062 pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1063 &evlist->core.pollfd);
1066 pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1067 thread_data, pos, evlist->core.pollfd.entries[f].fd);
/* Tear down all per-thread state: pipes, map arrays, pollfd sets, the array itself. */
1075 static void record__free_thread_data(struct record *rec)
1078 struct record_thread *thread_data = rec->thread_data;
1080 if (thread_data == NULL)
1083 for (t = 0; t < rec->nr_threads; t++) {
1084 record__thread_data_close_pipes(&thread_data[t]);
1085 zfree(&thread_data[t].maps);
1086 zfree(&thread_data[t].overwrite_maps);
1087 fdarray__exit(&thread_data[t].pollfd);
1090 zfree(&rec->thread_data);
/*
 * Append one (evlist index, thread index) pair to rec->index_map, growing
 * the array on demand, so revents can later be copied back across the two
 * pollfd sets.
 */
1093 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1094 int evlist_pollfd_index,
1095 int thread_pollfd_index)
1097 size_t x = rec->index_map_cnt;
1099 if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1101 rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1102 rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1103 rec->index_map_cnt += 1;
/*
 * Copy revents for every mapped non-perf fd from the thread's pollfd set
 * back into the evlist's set, verifying fd/events still agree (they were
 * duplicated from the same entries).
 */
1107 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1108 struct evlist *evlist,
1109 struct record_thread *thread_data)
1111 struct pollfd *e_entries = evlist->core.pollfd.entries;
1112 struct pollfd *t_entries = thread_data->pollfd.entries;
1116 for (i = 0; i < rec->index_map_cnt; i++) {
1117 int e_pos = rec->index_map[i].evlist_pollfd_index;
1118 int t_pos = rec->index_map[i].thread_pollfd_index;
1120 if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1121 e_entries[e_pos].events != t_entries[t_pos].events) {
1122 pr_err("Thread and evlist pollfd index mismatch\n");
1126 e_entries[e_pos].revents = t_entries[t_pos].revents;
/*
 * Duplicate every non-perf-event fd from the evlist's pollfd array into
 * this thread's pollfd set and record the index pairing for revents
 * propagation (see record__update_evlist_pollfd_from_thread()).
 */
1131 static int record__dup_non_perf_events(struct record *rec,
1132 struct evlist *evlist,
1133 struct record_thread *thread_data)
1135 struct fdarray *fda = &evlist->core.pollfd;
1138 for (i = 0; i < fda->nr; i++) {
1139 if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1141 ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1143 pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1146 pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1147 thread_data, ret, fda->entries[i].fd);
1148 ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1150 pr_err("Failed to map thread and evlist pollfd indexes\n");
/*
 * Allocate and initialize rec->thread_data for all rec->nr_threads workers:
 * per-thread maps and pollfd sets for every thread; for true worker threads
 * also the msg/ack control pipes (msg read end added to the pollfd set as a
 * nonfilterable entry). Thread 0 is the main thread (tid set via gettid())
 * and instead inherits the evlist's non-perf fds. On any failure all
 * thread data allocated so far is freed.
 */
1157 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1160 struct record_thread *thread_data;
1162 rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1163 if (!rec->thread_data) {
1164 pr_err("Failed to allocate thread data\n");
1167 thread_data = rec->thread_data;
/* Pre-mark all pipe fds as closed so cleanup is safe from any failure point. */
1169 for (t = 0; t < rec->nr_threads; t++)
1170 record__thread_data_init_pipes(&thread_data[t]);
1172 for (t = 0; t < rec->nr_threads; t++) {
1173 thread_data[t].rec = rec;
1174 thread_data[t].mask = &rec->thread_masks[t];
1175 ret = record__thread_data_init_maps(&thread_data[t], evlist);
1177 pr_err("Failed to initialize thread[%d] maps\n", t);
1180 ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1182 pr_err("Failed to initialize thread[%d] pollfd\n", t);
/* Worker thread: tid assigned later by the thread itself; open control pipes. */
1186 thread_data[t].tid = -1;
1187 ret = record__thread_data_open_pipes(&thread_data[t]);
1189 pr_err("Failed to open thread[%d] communication pipes\n", t);
1192 ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1193 POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1195 pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1198 thread_data[t].ctlfd_pos = ret;
1199 pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1200 thread_data, thread_data[t].ctlfd_pos,
1201 thread_data[t].pipes.msg[0]);
/* Main thread: known tid now, no control pipe needed. */
1203 thread_data[t].tid = gettid();
1205 ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1209 thread_data[t].ctlfd_pos = -1; /* Not used */
/* Error path: release everything allocated above. */
1216 record__free_thread_data(rec);
/*
 * mmap the evlist's ring buffers with the configured sizes, aio depth,
 * affinity, flush threshold and compression level; then set up the control
 * fd, the per-thread data, and (in parallel streaming mode) one output file
 * per mmap inside a perf.data directory.
 */
1221 static int record__mmap_evlist(struct record *rec,
1222 struct evlist *evlist)
1225 struct record_opts *opts = &rec->opts;
1226 bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1227 opts->auxtrace_sample_mode;
/* NUMA/CPU affinity modes need the cpu -> node map set up first. */
1230 if (opts->affinity != PERF_AFFINITY_SYS)
1231 cpu__setup_cpunode_map();
1233 if (evlist__mmap_ex(evlist, opts->mmap_pages,
1234 opts->auxtrace_mmap_pages,
1236 opts->nr_cblocks, opts->affinity,
1237 opts->mmap_flush, opts->comp_level) < 0) {
/* EPERM almost always means the mlock budget was exceeded - explain the knobs. */
1238 if (errno == EPERM) {
1239 pr_err("Permission error mapping pages.\n"
1240 "Consider increasing "
1241 "/proc/sys/kernel/perf_event_mlock_kb,\n"
1242 "or try again with a smaller value of -m/--mmap_pages.\n"
1243 "(current value: %u,%u)\n",
1244 opts->mmap_pages, opts->auxtrace_mmap_pages);
1247 pr_err("failed to mmap with %d (%s)\n", errno,
1248 str_error_r(errno, msg, sizeof(msg)));
1256 if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1259 ret = record__alloc_thread_data(rec, evlist);
/* Parallel streaming: each mmap (and overwrite mmap) gets its own data file. */
1263 if (record__threads_enabled(rec)) {
1264 ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1266 pr_err("Failed to create data directory: %s\n", strerror(-ret));
1269 for (i = 0; i < evlist->core.nr_mmaps; i++) {
1271 evlist->mmap[i].file = &rec->data.dir.files[i];
1272 if (evlist->overwrite_mmap)
1273 evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
/* Convenience wrapper: mmap the session's own evlist. */
1280 static int record__mmap(struct record *rec)
1282 return record__mmap_evlist(rec, rec->evlist);
/*
 * Open all events: optionally add a tracking dummy event (for --delay,
 * system-wide or hybrid setups), configure the evlist, open each evsel with
 * fallback / weak-group recovery, warn about kptr_restrict, apply filters,
 * mmap the buffers and attach the evlist to the session.
 */
1285 static int record__open(struct record *rec)
1289 struct evlist *evlist = rec->evlist;
1290 struct perf_session *session = rec->session;
1291 struct record_opts *opts = &rec->opts;
1295 * For initial_delay, system wide or a hybrid system, we need to add a
1296 * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
1297 * of waiting or event synthesis.
1299 if (opts->initial_delay || target__has_cpu(&opts->target) ||
1300 perf_pmu__has_hybrid()) {
1301 pos = evlist__get_tracking_event(evlist);
1302 if (!evsel__is_dummy_event(pos)) {
1303 /* Set up dummy event. */
1304 if (evlist__add_dummy(evlist))
1306 pos = evlist__last(evlist);
1307 evlist__set_tracking_event(evlist, pos);
1311 * Enable the dummy event when the process is forked for
1312 * initial_delay, immediately for system wide.
1314 if (opts->initial_delay && !pos->immediate &&
1315 !target__has_cpu(&opts->target))
1316 pos->core.attr.enable_on_exec = 1;
1321 evlist__config(evlist, opts, &callchain_param);
1323 evlist__for_each_entry(evlist, pos) {
1325 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
/* First recovery: fall back to a softer event config and retry (loop hidden). */
1326 if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
1328 ui__warning("%s\n", msg);
/* Second recovery: break up a weak group and retry from its leader. */
1331 if ((errno == EINVAL || errno == EBADF) &&
1332 pos->core.leader != &pos->core &&
1334 pos = evlist__reset_weak_group(evlist, pos, true);
1338 evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1339 ui__error("%s\n", msg);
1343 pos->supported = true;
1346 if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1348 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1349 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1350 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1351 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1352 "Samples in kernel modules won't be resolved at all.\n\n"
1353 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1354 "even with a suitable vmlinux or kallsyms file.\n\n"
1357 if (evlist__apply_filters(evlist, &pos)) {
1358 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1359 pos->filter, evsel__name(pos), errno,
1360 str_error_r(errno, msg, sizeof(msg)));
1365 rc = record__mmap(rec);
1369 session->evlist = evlist;
1370 perf_session__set_id_hdr_size(session);
/* Track the first and last sample timestamps seen, for the file header. */
1375 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1377 if (rec->evlist->first_sample_time == 0)
1378 rec->evlist->first_sample_time = sample_time;
1381 rec->evlist->last_sample_time = sample_time;
/*
 * Post-processing sample callback: update the timestamp boundaries and
 * (unless --buildid-all already marked everything) mark the sample's DSO
 * as hit so only referenced build-ids are stored.
 */
1384 static int process_sample_event(struct perf_tool *tool,
1385 union perf_event *event,
1386 struct perf_sample *sample,
1387 struct evsel *evsel,
1388 struct machine *machine)
1390 struct record *rec = container_of(tool, struct record, tool);
1392 set_timestamp_boundary(rec, sample->time);
1394 if (rec->buildid_all)
1398 return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
/*
 * Re-read the just-recorded data to collect build-ids of the DSOs that were
 * hit; skipped entirely when no data was written.
 */
1401 static int process_buildids(struct record *rec)
1403 struct perf_session *session = rec->session;
1405 if (perf_data__size(&rec->data) == 0)
1409 * During this process, it'll load kernel map and replace the
1410 * dso->long_name to a real pathname it found. In this case
1411 * we prefer the vmlinux path like
1412 * /lib/modules/3.16.4/build/vmlinux
1414 * rather than build-id path (in debug directory).
1415 * $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1417 symbol_conf.ignore_vmlinux_buildid = true;
1420 * If --buildid-all is given, it marks all DSO regardless of hits,
1421 * so no need to process samples. But if timestamp_boundary is enabled,
1422 * it still needs to walk on all samples to get the timestamps of
1423 * first/last samples.
1425 if (rec->buildid_all && !rec->timestamp_boundary)
1426 rec->tool.sample = NULL;
1428 return perf_session__process_events(session);
/* Synthesize module and kernel mmap events for one guest machine. */
1431 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1434 struct perf_tool *tool = data;
1436 *As for guest kernel when processing subcommand record&report,
1437 *we arrange module mmap prior to guest kernel mmap and trigger
1438 *a preload dso because default guest module symbols are loaded
1439 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1440 *method is used to avoid symbol missing when the first addr is
1441 *in module instead of in guest kernel.
1443 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1446 pr_err("Couldn't record guest kernel [%d]'s reference"
1447 " relocation symbol.\n", machine->pid);
1450 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1451 * have no _text sometimes.
1453 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1456 pr_err("Couldn't record guest kernel [%d]'s reference"
1457 " relocation symbol.\n", machine->pid);
/* Header-only event flushed after each mmap read round (see record__mmap_read_evlist). */
1460 static struct perf_event_header finished_round_event = {
1461 .size = sizeof(struct perf_event_header),
1462 .type = PERF_RECORD_FINISHED_ROUND,
/* Header-only event marking the end of initial synthesized events (see write_finished_init). */
1465 static struct perf_event_header finished_init_event = {
1466 .size = sizeof(struct perf_event_header),
1467 .type = PERF_RECORD_FINISHED_INIT,
/*
 * With --affinity=node/cpu, migrate the current thread onto the CPU set of
 * the mmap it is about to read, so the buffer is drained NUMA-locally.
 * Uses the per-thread TLS 'thread' record_thread state.
 */
1470 static void record__adjust_affinity(struct record *rec, struct mmap *map)
/* Only move if the map's mask differs from the thread's current affinity. */
1472 if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1473 !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1474 thread->mask->affinity.nbits)) {
/* Copy map->affinity_mask into thread->mask->affinity (zero then OR). */
1475 bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1476 bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1477 map->affinity_mask.bits, thread->mask->affinity.nbits);
1478 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1479 (cpu_set_t *)thread->mask->affinity.bits);
1481 pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1482 mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
/*
 * zstd_compress_stream_to_records() callback: lay down / grow the
 * PERF_RECORD_COMPRESSED header at 'record'. With increment == 0 it writes
 * a fresh header and returns its size; otherwise it extends header.size by
 * the number of compressed bytes just appended.
 */
1487 static size_t process_comp_header(void *record, size_t increment)
1489 struct perf_record_compressed *event = record;
1490 size_t size = sizeof(*event);
1493 event->header.size += increment;
1497 event->header.type = PERF_RECORD_COMPRESSED;
1498 event->header.size = size;
/*
 * Compress 'src' into 'dst' as one or more PERF_RECORD_COMPRESSED records.
 * In threaded (directory) mode each mmap has its own zstd stream and the
 * transferred/compressed byte counters are accumulated per-thread;
 * otherwise the session-wide stream and counters are used.
 * Returns the number of compressed bytes produced.
 */
1503 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1504 void *dst, size_t dst_size, void *src, size_t src_size)
/* Reserve one byte so a record never exceeds PERF_SAMPLE_MAX_SIZE. */
1507 size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1508 struct zstd_data *zstd_data = &session->zstd_data;
/* map->file is only set in threaded mode: use the per-mmap stream then. */
1510 if (map && map->file)
1511 zstd_data = &map->zstd_data;
1513 compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1514 max_record_size, process_comp_header);
1516 if (map && map->file) {
1517 thread->bytes_transferred += src_size;
1518 thread->bytes_compressed += compressed;
1520 session->bytes_transferred += src_size;
1521 session->bytes_compressed += compressed;
/*
 * Drain all (overwrite or regular) mmaps owned by the current thread into
 * the output, via AIO or direct push, then read any AUX area data and emit
 * a FINISHED_ROUND record if anything was written. 'synch' forces a flush
 * value of 1 so every pending byte is pushed out.
 */
1527 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1528 bool overwrite, bool synch)
1530 u64 bytes_written = rec->bytes_written;
1535 int trace_fd = rec->data.file.fd;
1541 nr_mmaps = thread->nr_mmaps;
1542 maps = overwrite ? thread->overwrite_maps : thread->maps;
/* Overwrite buffers are only readable once paused (DATA_PENDING). */
1547 if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1550 if (record__aio_enabled(rec))
1551 off = record__aio_get_pos(trace_fd);
1553 for (i = 0; i < nr_mmaps; i++) {
1555 struct mmap *map = maps[i];
1557 if (map->core.base) {
1558 record__adjust_affinity(rec, map);
/* Temporarily drop the flush threshold to 1 so everything is pushed. */
1560 flush = map->core.flush;
1561 map->core.flush = 1;
1563 if (!record__aio_enabled(rec)) {
1564 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1566 map->core.flush = flush;
1571 if (record__aio_push(rec, map, &off) < 0) {
/* On AIO failure, rewind the file position to the last good offset. */
1572 record__aio_set_pos(trace_fd, off);
1574 map->core.flush = flush;
1580 map->core.flush = flush;
/* AUX area data is read here unless snapshot/sample modes handle it. */
1583 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1584 !rec->opts.auxtrace_sample_mode &&
1585 record__auxtrace_mmap_read(rec, map) != 0) {
1591 if (record__aio_enabled(rec))
1592 record__aio_set_pos(trace_fd, off);
1595 * Mark the round finished in case we wrote
1596 * at least one event.
1598 * No need for round events in directory mode,
1599 * because per-cpu maps and files have data
1602 if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1603 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
/* Overwrite buffers were drained: mark them empty again. */
1606 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
/* Drain the regular mmaps first, then the overwrite mmaps. */
1611 static int record__mmap_read_all(struct record *rec, bool synch)
1615 err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1619 return record__mmap_read_evlist(rec, rec->evlist, true, synch);
/*
 * fdarray__filter() callback: drop the reference on the mmap whose poll fd
 * reported POLLERR/POLLHUP (e.g. its task exited).
 */
1622 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1623 void *arg __maybe_unused)
1625 struct perf_mmap *map = fda->priv[fd].ptr;
1628 perf_mmap__put(map);
/*
 * Worker thread body for parallel (--threads) recording. Acks its start on
 * the pipe, then loops draining its mmaps, polling when idle, until the
 * main thread closes the msg pipe (POLLHUP on ctlfd_pos). Performs a final
 * synchronous drain and acks termination before exiting.
 */
1631 static void *record__thread(void *arg)
1633 enum thread_msg msg = THREAD_MSG__READY;
1634 bool terminate = false;
1635 struct fdarray *pollfd;
1639 thread->tid = gettid();
/* Tell record__start_threads() we are up. */
1641 err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1643 pr_warning("threads[%d]: failed to notify on start: %s\n",
1644 thread->tid, strerror(errno));
1646 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1648 pollfd = &thread->pollfd;
1649 ctlfd_pos = thread->ctlfd_pos;
1652 unsigned long long hits = thread->samples;
1654 if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
/* No new samples since last round: sleep in poll(). */
1657 if (hits == thread->samples) {
1659 err = fdarray__poll(pollfd, -1);
1661 * Propagate error, only if there's any. Ignore positive
1662 * number of returned events and interrupt error.
1664 if (err > 0 || (err < 0 && errno == EINTR))
1668 if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1669 record__thread_munmap_filtered, NULL) == 0)
/* Main thread closed the msg pipe: stop watching it and terminate. */
1673 if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1675 close(thread->pipes.msg[0]);
1676 thread->pipes.msg[0] = -1;
1677 pollfd->entries[ctlfd_pos].fd = -1;
1678 pollfd->entries[ctlfd_pos].events = 0;
1681 pollfd->entries[ctlfd_pos].revents = 0;
/* Final synchronous drain of everything still buffered. */
1683 record__mmap_read_all(thread->rec, true);
1685 err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1687 pr_warning("threads[%d]: failed to notify on termination: %s\n",
1688 thread->tid, strerror(errno));
/*
 * Start with every header feature enabled, then clear the ones that do not
 * apply to this session's options (build-ids, tracepoints, branch stack,
 * auxtrace, clock data, directory format, compression, stat).
 */
1693 static void record__init_features(struct record *rec)
1695 struct perf_session *session = rec->session;
1698 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1699 perf_header__set_feat(&session->header, feat);
1701 if (rec->no_buildid)
1702 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1704 if (!have_tracepoints(&rec->evlist->core.entries))
1705 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1707 if (!rec->opts.branch_stack)
1708 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1710 if (!rec->opts.full_auxtrace)
1711 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1713 if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1714 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1716 if (!rec->opts.use_clockid)
1717 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
/* HEADER_DIR_FORMAT only makes sense for threaded (directory) output. */
1719 if (!record__threads_enabled(rec))
1720 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1722 if (!record__comp_enabled(rec))
1723 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
/* HEADER_STAT is for 'perf stat record', never for 'perf record'. */
1725 perf_header__clear_feat(&session->header, HEADER_STAT);
/*
 * Finalize the output file: record the data size(s), run build-id
 * processing unless --no-buildid, and rewrite the header in place.
 */
1729 record__finish_output(struct record *rec)
1732 struct perf_data *data = &rec->data;
1733 int fd = perf_data__fd(data);
1738 rec->session->header.data_size += rec->bytes_written;
/* File size is the current write offset. */
1739 data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1740 if (record__threads_enabled(rec)) {
1741 for (i = 0; i < data->dir.nr; i++)
1742 data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1745 if (!rec->no_buildid) {
1746 process_buildids(rec);
/* --buildid-all: mark every DSO, not just the ones hit by samples. */
1748 if (rec->buildid_all)
1749 dsos__hit_all(rec->session);
1751 perf_session__write_header(rec->session, rec->evlist, fd, true);
/*
 * Synthesize thread-map events for the forked workload PID. Runs either at
 * start or at exit depending on --tail-synthesize ('tail' must match).
 */
1756 static int record__synthesize_workload(struct record *rec, bool tail)
1759 struct perf_thread_map *thread_map;
1760 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1762 if (rec->opts.tail_synthesize != tail)
/* Single-entry map covering just the workload's pid. */
1765 thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1766 if (thread_map == NULL)
1769 err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1770 process_synthesized_event,
1771 &rec->session->machines.host,
1773 rec->opts.sample_address);
1774 perf_thread_map__put(thread_map);
/* Emit PERF_RECORD_FINISHED_INIT, honoring --tail-synthesize ordering. */
1778 static int write_finished_init(struct record *rec, bool tail)
1780 if (rec->opts.tail_synthesize != tail)
1783 return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1786 static int record__synthesize(struct record *rec, bool tail);
/*
 * Rotate the output file for --switch-output: finish the current file
 * (renamed with a timestamp), open a fresh one, and re-synthesize the
 * tracking events the new file needs. With 'at_exit' this is the final
 * rotation and no new file state is reset. Returns the new output fd.
 */
1789 record__switch_output(struct record *rec, bool at_exit)
1791 struct perf_data *data = &rec->data;
1795 /* Same size as "2015122520103046": placeholder used if timestamp fetch fails. */
1796 char timestamp[] = "InvalidTimestamp";
1798 record__aio_mmap_read_sync(rec);
1800 write_finished_init(rec, true);
1802 record__synthesize(rec, true);
1803 if (target__none(&rec->opts.target))
1804 record__synthesize_workload(rec, true);
1807 record__finish_output(rec);
1808 err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1810 pr_err("Failed to get current timestamp\n");
/* Rename current file to <path>.<timestamp> and open the next one. */
1814 fd = perf_data__switch(data, timestamp,
1815 rec->session->header.data_offset,
1816 at_exit, &new_filename);
1817 if (fd >= 0 && !at_exit) {
/* Fresh file: restart the byte accounting. */
1818 rec->bytes_written = 0;
1819 rec->session->header.data_size = 0;
1823 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1824 data->path, timestamp);
/* --switch-output with a file limit: keep a ring of N filenames. */
1826 if (rec->switch_output.num_files) {
1827 int n = rec->switch_output.cur_file + 1;
1829 if (n >= rec->switch_output.num_files)
1831 rec->switch_output.cur_file = n;
1832 if (rec->switch_output.filenames[n]) {
/* Slot reused: delete the old dump before remembering the new name. */
1833 remove(rec->switch_output.filenames[n]);
1834 zfree(&rec->switch_output.filenames[n]);
1836 rec->switch_output.filenames[n] = new_filename;
1841 /* Output tracking events */
1843 record__synthesize(rec, false);
1846 * In 'perf record --switch-output' without -a,
1847 * record__synthesize() in record__switch_output() won't
1848 * generate tracking events because there's no thread_map
1849 * in evlist. Which causes newly created perf.data doesn't
1850 * contain map and comm information.
1851 * Create a fake thread_map and directly call
1852 * perf_event__synthesize_thread_map() for those events.
1854 if (target__none(&rec->opts.target))
1855 record__synthesize_workload(rec, false);
1856 write_finished_init(rec, false);
/*
 * Read the lost-sample count of one evsel instance (cpu_idx, thread_idx)
 * and, if non-zero, write a PERF_RECORD_LOST_SAMPLES record (with an
 * id sample appended when the evsel has sample ids) into the output.
 * 'lost' is a caller-provided scratch buffer of PERF_SAMPLE_MAX_SIZE.
 */
1861 static void __record__read_lost_samples(struct record *rec, struct evsel *evsel,
1862 struct perf_record_lost_samples *lost,
1863 int cpu_idx, int thread_idx)
1865 struct perf_counts_values count;
1866 struct perf_sample_id *sid;
1867 struct perf_sample sample = {};
1870 if (perf_evsel__read(&evsel->core, cpu_idx, thread_idx, &count) < 0) {
1871 pr_err("read LOST count failed\n");
/* No losses on this instance: emit nothing. */
1875 if (count.lost == 0)
1878 lost->lost = count.lost;
1879 if (evsel->core.ids) {
1880 sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1881 sample.id = sid->id;
/* Append the id sample right after the fixed-size record body. */
1884 id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1885 evsel->core.attr.sample_type, &sample);
1886 lost->header.size = sizeof(*lost) + id_hdr_size;
1887 record__write(rec, NULL, lost, lost->header.size);
/*
 * At the end of the session, walk every evsel instance and emit
 * LOST_SAMPLES records for any counts the kernel reported as lost.
 */
1890 static void record__read_lost_samples(struct record *rec)
1892 struct perf_session *session = rec->session;
1893 struct perf_record_lost_samples *lost;
1894 struct evsel *evsel;
1896 /* there was an error during record__open */
1897 if (session->evlist == NULL)
/* One scratch buffer reused for every record (max possible size). */
1900 lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1902 pr_debug("Memory allocation failed\n");
1906 lost->header.type = PERF_RECORD_LOST_SAMPLES;
1908 evlist__for_each_entry(session->evlist, evsel) {
1909 struct xyarray *xy = evsel->core.sample_id;
1911 if (xy == NULL || evsel->core.fd == NULL)
/* fd and sample_id matrices must agree before indexing both. */
1913 if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1914 xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1915 pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1919 for (int x = 0; x < xyarray__max_x(xy); x++) {
1920 for (int y = 0; y < xyarray__max_y(xy); y++) {
1921 __record__read_lost_samples(rec, evsel, lost, x, y);
/* errno of a failed workload exec, delivered via SIGUSR1 sigqueue payload. */
1929 static volatile int workload_exec_errno;
1932 * evlist__prepare_workload will send a SIGUSR1
1933 * if the fork fails, since we asked by setting its
1934 * want_signal to true.
1936 static void workload_exec_failed_signal(int signo __maybe_unused,
1938 void *ucontext __maybe_unused)
1940 workload_exec_errno = info->si_value.sival_int;
1945 static void snapshot_sig_handler(int sig);
1946 static void alarm_sig_handler(int sig);
/*
 * Return the perf_event_mmap_page of the first mapped ring buffer
 * (regular first, then overwrite), used to read kernel timekeeping data.
 */
1948 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1951 if (evlist->mmap && evlist->mmap[0].core.base)
1952 return evlist->mmap[0].core.base;
1953 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1954 return evlist->overwrite_mmap[0].core.base;
/* Wrapper: pick a control page from the record's evlist. */
1959 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1961 const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
/*
 * Emit all synthesized (non-kernel-generated) events the report side needs:
 * pipe attrs, time-conv, id index, auxtrace info, kernel/module mmaps,
 * guest OS events, extra attrs, thread/cpu maps, BPF and cgroup events,
 * and finally the existing-task events (possibly multithreaded).
 * Honors --tail-synthesize via the 'tail' flag.
 */
1967 static int record__synthesize(struct record *rec, bool tail)
1969 struct perf_session *session = rec->session;
1970 struct machine *machine = &session->machines.host;
1971 struct perf_data *data = &rec->data;
1972 struct record_opts *opts = &rec->opts;
1973 struct perf_tool *tool = &rec->tool;
1975 event_op f = process_synthesized_event;
1977 if (rec->opts.tail_synthesize != tail)
/* Pipe mode has no header, so event attrs etc. go inline first. */
1980 if (data->is_pipe) {
1981 err = perf_event__synthesize_for_pipe(tool, session, data,
1982 process_synthesized_event);
1986 rec->bytes_written += err;
1989 err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1990 process_synthesized_event, machine);
1994 /* Synthesize id_index before auxtrace_info */
1995 err = perf_event__synthesize_id_index(tool,
1996 process_synthesized_event,
1997 session->evlist, machine);
2001 if (rec->opts.full_auxtrace) {
2002 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2003 session, process_synthesized_event);
2008 if (!evlist__exclude_kernel(rec->evlist)) {
2009 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2011 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2012 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2013 "Check /proc/kallsyms permission or run as root.\n");
2015 err = perf_event__synthesize_modules(tool, process_synthesized_event,
2017 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2018 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2019 "Check /proc/modules permission or run as root.\n");
2023 machines__process_guests(&session->machines,
2024 perf_event__synthesize_guest_os, tool);
2027 err = perf_event__synthesize_extra_attr(&rec->tool,
2029 process_synthesized_event,
2034 err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2035 process_synthesized_event,
2038 pr_err("Couldn't synthesize thread map.\n");
2042 err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2043 process_synthesized_event, NULL);
2045 pr_err("Couldn't synthesize cpu map.\n");
2049 err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2052 pr_warning("Couldn't synthesize bpf events.\n");
2056 if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2057 err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2060 pr_warning("Couldn't synthesize cgroup events.\n");
/* --num-thread-synthesize > 1: serialize writes via a lock-taking op. */
2065 if (rec->opts.nr_threads_synthesize > 1) {
2066 mutex_init(&synth_lock);
2067 perf_set_multithreaded();
2068 f = process_locked_synthesized_event;
2071 if (rec->opts.synth & PERF_SYNTH_TASK) {
2072 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2074 err = __machine__synthesize_threads(machine, tool, &opts->target,
2075 rec->evlist->core.threads,
2076 f, needs_mmap, opts->sample_address,
2077 rec->opts.nr_threads_synthesize);
2080 if (rec->opts.nr_threads_synthesize > 1) {
2081 perf_set_singlethreaded();
2082 mutex_destroy(&synth_lock);
/*
 * Side-band evlist callback: forward a switch-output event as SIGUSR2 to
 * the main record thread so it rotates the output file.
 */
2089 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2091 struct record *rec = data;
2092 pthread_kill(rec->thread_id, SIGUSR2);
/*
 * Set up the side-band evlist: hook up --switch-output-event delivery,
 * optionally add the BPF side-band event, and start the side-band thread.
 * A failure to start the thread only disables BPF annotation, it is not
 * fatal to the record session.
 */
2096 static int record__setup_sb_evlist(struct record *rec)
2098 struct record_opts *opts = &rec->opts;
2100 if (rec->sb_evlist != NULL) {
2102 * We get here if --switch-output-event populated the
2103 * sb_evlist, so associate a callback that will send a SIGUSR2
2104 * to the main thread.
2106 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2107 rec->thread_id = pthread_self();
2109 #ifdef HAVE_LIBBPF_SUPPORT
2110 if (!opts->no_bpf_event) {
2111 if (rec->sb_evlist == NULL) {
2112 rec->sb_evlist = evlist__new();
2114 if (rec->sb_evlist == NULL) {
2115 pr_err("Couldn't create side band evlist.\n.");
2120 if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2121 pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2126 if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2127 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2128 opts->no_bpf_event = true;
/*
 * With --clockid, store the clock id/resolution and a reference pair of
 * (wall clock, session clock) timestamps in the header env so reports can
 * translate sample times to time-of-day.
 */
2134 static int record__init_clock(struct record *rec)
2136 struct perf_session *session = rec->session;
2137 struct timespec ref_clockid;
2138 struct timeval ref_tod;
2141 if (!rec->opts.use_clockid)
2144 if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2145 session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2147 session->header.env.clock.clockid = rec->opts.clockid;
2149 if (gettimeofday(&ref_tod, NULL) != 0) {
2150 pr_err("gettimeofday failed, cannot set reference time.\n");
2154 if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2155 pr_err("clock_gettime failed, cannot set reference time.\n");
/* Wall-clock reference in nanoseconds. */
2159 ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2160 (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2162 session->header.env.clock.tod_ns = ref;
/* Matching reference taken on the session clock. */
2164 ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2165 (u64) ref_clockid.tv_nsec;
2167 session->header.env.clock.clockid_ns = ref;
/*
 * Fire the AUX area snapshot trigger (from SIGUSR2 or a control command)
 * and kick off the snapshot; mark the trigger errored if the start fails.
 */
2171 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2173 if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2174 trigger_hit(&auxtrace_snapshot_trigger);
2175 auxtrace_record__snapshot_started = 1;
2176 if (auxtrace_record__snapshot_start(rec->itr))
2177 trigger_error(&auxtrace_snapshot_trigger);
/*
 * On hybrid systems, prefix each hybrid evsel's name with its PMU
 * ("pmu/event/") so events from different core types are distinguishable.
 * Names that already contain '/' are left alone.
 */
2181 static void record__uniquify_name(struct record *rec)
2184 struct evlist *evlist = rec->evlist;
2188 if (!perf_pmu__has_hybrid())
2191 evlist__for_each_entry(evlist, pos) {
2192 if (!evsel__is_hybrid(pos))
2195 if (strchr(pos->name, '/'))
2198 ret = asprintf(&new_name, "%s/%s/",
2199 pos->pmu_name, pos->name);
2202 pos->name = new_name;
/*
 * Ask one worker thread to stop by closing the write end of its msg pipe
 * (the worker sees POLLHUP), then wait for its termination ack.
 */
2207 static int record__terminate_thread(struct record_thread *thread_data)
2210 enum thread_msg ack = THREAD_MSG__UNDEFINED;
2211 pid_t tid = thread_data->tid;
2213 close(thread_data->pipes.msg[1]);
2214 thread_data->pipes.msg[1] = -1;
2215 err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2217 pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2219 pr_warning("threads[%d]: failed to receive termination notification from %d\n",
/*
 * Spawn the worker threads for parallel recording (thread_data[1..n-1];
 * slot 0 is the main thread). Signals are blocked around thread creation
 * so only the main thread handles them; each worker is pinned to its
 * affinity mask and must ack its start over the pipe. On pthread_create
 * failure, every already-started worker is terminated.
 */
2225 static int record__start_threads(struct record *rec)
2227 int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2228 struct record_thread *thread_data = rec->thread_data;
2229 sigset_t full, mask;
2231 pthread_attr_t attrs;
/* TLS: the main thread uses slot 0 of thread_data. */
2233 thread = &thread_data[0];
2235 if (!record__threads_enabled(rec))
/* Block all signals in workers; restore the old mask before returning. */
2239 if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2240 pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2244 pthread_attr_init(&attrs);
2245 pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2247 for (t = 1; t < nr_threads; t++) {
2248 enum thread_msg msg = THREAD_MSG__UNDEFINED;
2250 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2251 pthread_attr_setaffinity_np(&attrs,
2252 MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2253 (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2255 if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
/* Tear down the threads already started (1..t-1), not slot t. */
2256 for (tt = 1; tt < t; tt++)
2257 record__terminate_thread(&thread_data[tt]);
2258 pr_err("Failed to start threads: %s\n", strerror(errno));
/* Wait for the worker's start ack before creating the next one. */
2263 err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2265 pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2266 thread_msg_tags[msg]);
2268 pr_warning("threads[%d]: failed to receive start notification from %d\n",
2269 thread->tid, rec->thread_data[t].tid);
/* Pin the main thread to its own affinity mask as well. */
2272 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2273 (cpu_set_t *)thread->mask->affinity.bits);
2275 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2278 pthread_attr_destroy(&attrs);
2280 if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2281 pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
/*
 * Terminate all worker threads, then fold their per-thread sample and
 * byte counters into the record/session totals and log per-thread stats.
 */
2288 static int record__stop_threads(struct record *rec)
2291 struct record_thread *thread_data = rec->thread_data;
/* Slot 0 is the main thread: only workers 1..n-1 are terminated. */
2293 for (t = 1; t < rec->nr_threads; t++)
2294 record__terminate_thread(&thread_data[t]);
2296 for (t = 0; t < rec->nr_threads; t++) {
2297 rec->samples += thread_data[t].samples;
2298 if (!record__threads_enabled(rec))
2300 rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2301 rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2302 pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2303 thread_data[t].samples, thread_data[t].waking);
2304 if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2305 pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2306 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2308 pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
/* Sum the wakeup counts of all recording threads (for the final report). */
2314 static unsigned long record__waking(struct record *rec)
2317 unsigned long waking = 0;
2318 struct record_thread *thread_data = rec->thread_data;
2320 for (t = 0; t < rec->nr_threads; t++)
2321 waking += thread_data[t].waking;
/*
 * Main driver of 'perf record': set up signals, the session, compression
 * and the output header; fork/exec the workload if argv is given; run the
 * poll/drain loop until done; then tear down threads, account lost
 * samples, finalize (or rotate) the output file and print the summary.
 * Fix applied: the third argument to sched_setscheduler() was corrupted
 * by a character-encoding error ("¶m"); restored to "&param".
 */
2326 static int __cmd_record(struct record *rec, int argc, const char **argv)
2330 const bool forks = argc > 0;
2331 struct perf_tool *tool = &rec->tool;
2332 struct record_opts *opts = &rec->opts;
2333 struct perf_data *data = &rec->data;
2334 struct perf_session *session;
2335 bool disabled = false, draining = false;
2338 enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
/* Signal/exit handling must be in place before anything can fail. */
2340 atexit(record__sig_exit);
2341 signal(SIGCHLD, sig_handler);
2342 signal(SIGINT, sig_handler);
2343 signal(SIGTERM, sig_handler);
2344 signal(SIGSEGV, sigsegv_handler);
2346 if (rec->opts.record_namespaces)
2347 tool->namespace_events = true;
2349 if (rec->opts.record_cgroup) {
2350 #ifdef HAVE_FILE_HANDLE
2351 tool->cgroup_events = true;
2353 pr_err("cgroup tracking is not supported\n");
/* SIGUSR2 drives auxtrace snapshots and output switching, else ignored. */
2358 if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2359 signal(SIGUSR2, snapshot_sig_handler);
2360 if (rec->opts.auxtrace_snapshot_mode)
2361 trigger_on(&auxtrace_snapshot_trigger);
2362 if (rec->switch_output.enabled)
2363 trigger_on(&switch_output_trigger);
2365 signal(SIGUSR2, SIG_IGN);
2368 session = perf_session__new(data, tool);
2369 if (IS_ERR(session)) {
2370 pr_err("Perf session creation failed.\n");
2371 return PTR_ERR(session);
/* --threads is incompatible with pipe output and full AUX tracing. */
2374 if (record__threads_enabled(rec)) {
2375 if (perf_data__is_pipe(&rec->data)) {
2376 pr_err("Parallel trace streaming is not available in pipe mode.\n");
2379 if (rec->opts.full_auxtrace) {
2380 pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2385 fd = perf_data__fd(data);
2386 rec->session = session;
2388 if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2389 pr_err("Compression initialization failed.\n");
2392 #ifdef HAVE_EVENTFD_SUPPORT
/* eventfd lets signal handlers wake the poll loop without races. */
2393 done_fd = eventfd(0, EFD_NONBLOCK);
2395 pr_err("Failed to create wakeup eventfd, error: %m\n");
2397 goto out_delete_session;
2399 err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2401 pr_err("Failed to add wakeup eventfd to poll list\n");
2403 goto out_delete_session;
2405 #endif // HAVE_EVENTFD_SUPPORT
2407 session->header.env.comp_type = PERF_COMP_ZSTD;
2408 session->header.env.comp_level = rec->opts.comp_level;
2410 if (rec->opts.kcore &&
2411 !record__kcore_readable(&session->machines.host)) {
2412 pr_err("ERROR: kcore is not readable.\n");
2416 if (record__init_clock(rec))
2419 record__init_features(rec);
/* Fork the workload now; it waits for evlist__start_workload() later. */
2422 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2423 workload_exec_failed_signal);
2425 pr_err("Couldn't run the workload!\n");
2427 goto out_delete_session;
2432 * If we have just single event and are sending data
2433 * through pipe, we need to force the ids allocation,
2434 * because we synthesize event name through the pipe
2435 * and need the id for that.
2437 if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2438 rec->opts.sample_id = true;
2440 record__uniquify_name(rec);
2442 /* Debug message used by test scripts */
2443 pr_debug3("perf record opening and mmapping events\n");
2444 if (record__open(rec) != 0) {
2446 goto out_free_threads;
2448 /* Debug message used by test scripts */
2449 pr_debug3("perf record done opening and mmapping events\n");
2450 session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2452 if (rec->opts.kcore) {
2453 err = record__kcore_copy(&session->machines.host, data);
2455 pr_err("ERROR: Failed to copy kcore\n");
2456 goto out_free_threads;
2460 err = bpf__apply_obj_config();
2462 char errbuf[BUFSIZ];
2464 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2465 pr_err("ERROR: Apply config to BPF failed: %s\n",
2467 goto out_free_threads;
2471 * Normally perf_session__new would do this, but it doesn't have the
2474 if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2475 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2476 rec->tool.ordered_events = false;
2479 if (!rec->evlist->core.nr_groups)
2480 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
/* Write the (pipe or file) header before any event data. */
2482 if (data->is_pipe) {
2483 err = perf_header__write_pipe(fd);
2485 goto out_free_threads;
2487 err = perf_session__write_header(session, rec->evlist, fd, false);
2489 goto out_free_threads;
2493 if (!rec->no_buildid
2494 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2495 pr_err("Couldn't generate buildids. "
2496 "Use --no-buildid to profile anyway.\n");
2497 goto out_free_threads;
2500 err = record__setup_sb_evlist(rec);
2502 goto out_free_threads;
2504 err = record__synthesize(rec, false);
2506 goto out_free_threads;
2508 if (rec->realtime_prio) {
2509 struct sched_param param;
2511 param.sched_priority = rec->realtime_prio;
2512 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2513 pr_err("Could not set realtime priority.\n");
2515 goto out_free_threads;
2519 if (record__start_threads(rec))
2520 goto out_free_threads;
2523 * When perf is starting the traced process, all the events
2524 * (apart from group members) have enable_on_exec=1 set,
2525 * so don't spoil it by prematurely enabling them.
2527 if (!target__none(&opts->target) && !opts->initial_delay)
2528 evlist__enable(rec->evlist);
2534 struct machine *machine = &session->machines.host;
2535 union perf_event *event;
2538 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2539 if (event == NULL) {
2545 * Some H/W events are generated before COMM event
2546 * which is emitted during exec(), so perf script
2547 * cannot see a correct process name for those events.
2548 * Synthesize COMM event to prevent it.
2550 tgid = perf_event__synthesize_comm(tool, event,
2551 rec->evlist->workload.pid,
2552 process_synthesized_event,
2559 event = malloc(sizeof(event->namespaces) +
2560 (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2561 machine->id_hdr_size);
2562 if (event == NULL) {
2568 * Synthesize NAMESPACES event for the command specified.
2570 perf_event__synthesize_namespaces(tool, event,
2571 rec->evlist->workload.pid,
2572 tgid, process_synthesized_event,
/* Release the forked workload: it exec()s now. */
2576 evlist__start_workload(rec->evlist);
2579 if (opts->initial_delay) {
2580 pr_info(EVLIST_DISABLED_MSG);
2581 if (opts->initial_delay > 0) {
2582 usleep(opts->initial_delay * USEC_PER_MSEC);
2583 evlist__enable(rec->evlist);
2584 pr_info(EVLIST_ENABLED_MSG);
2588 err = event_enable_timer__start(rec->evlist->eet);
2592 /* Debug message used by test scripts */
2593 pr_debug3("perf record has started\n");
2596 trigger_ready(&auxtrace_snapshot_trigger);
2597 trigger_ready(&switch_output_trigger);
2598 perf_hooks__invoke_record_start();
2601 * Must write FINISHED_INIT so it will be seen after all other
2602 * synthesized user events, but before any regular events.
2604 err = write_finished_init(rec, false);
/* ---- main poll/drain loop ---- */
2609 unsigned long long hits = thread->samples;
2612 * rec->evlist->bkw_mmap_state is possible to be
2613 * BKW_MMAP_EMPTY here: when done == true and
2614 * hits != rec->samples in previous round.
2616 * evlist__toggle_bkw_mmap ensure we never
2617 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2619 if (trigger_is_hit(&switch_output_trigger) || done || draining)
2620 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2622 if (record__mmap_read_all(rec, false) < 0) {
2623 trigger_error(&auxtrace_snapshot_trigger);
2624 trigger_error(&switch_output_trigger);
2629 if (auxtrace_record__snapshot_started) {
2630 auxtrace_record__snapshot_started = 0;
2631 if (!trigger_is_error(&auxtrace_snapshot_trigger))
2632 record__read_auxtrace_snapshot(rec, false);
2633 if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2634 pr_err("AUX area tracing snapshot failed\n");
2640 if (trigger_is_hit(&switch_output_trigger)) {
2642 * If switch_output_trigger is hit, the data in
2643 * overwritable ring buffer should have been collected,
2644 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2646 * If SIGUSR2 raise after or during record__mmap_read_all(),
2647 * record__mmap_read_all() didn't collect data from
2648 * overwritable ring buffer. Read again.
2650 if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2652 trigger_ready(&switch_output_trigger);
2655 * Reenable events in overwrite ring buffer after
2656 * record__mmap_read_all(): we should have collected
2659 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2662 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2663 record__waking(rec));
2665 fd = record__switch_output(rec, false);
2667 pr_err("Failed to switch to new file\n");
2668 trigger_error(&switch_output_trigger);
2673 /* re-arm the alarm */
2674 if (rec->switch_output.time)
2675 alarm(rec->switch_output.time);
/* Nothing new arrived this round: block in poll unless finishing. */
2678 if (hits == thread->samples) {
2679 if (done || draining)
2681 err = fdarray__poll(&thread->pollfd, -1);
2683 * Propagate error, only if there's any. Ignore positive
2684 * number of returned events and interrupt error.
2686 if (err > 0 || (err < 0 && errno == EINTR))
2690 if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2691 record__thread_munmap_filtered, NULL) == 0)
2694 err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
/* Handle commands arriving on the --control fd. */
2699 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2701 case EVLIST_CTL_CMD_SNAPSHOT:
2702 hit_auxtrace_snapshot_trigger(rec);
2703 evlist__ctlfd_ack(rec->evlist);
2705 case EVLIST_CTL_CMD_STOP:
2708 case EVLIST_CTL_CMD_ACK:
2709 case EVLIST_CTL_CMD_UNSUPPORTED:
2710 case EVLIST_CTL_CMD_ENABLE:
2711 case EVLIST_CTL_CMD_DISABLE:
2712 case EVLIST_CTL_CMD_EVLIST:
2713 case EVLIST_CTL_CMD_PING:
2719 err = event_enable_timer__process(rec->evlist->eet);
2728 * When perf is starting the traced process, at the end events
2729 * die with the process and we wait for that. Thus no need to
2730 * disable events in this case.
2732 if (done && !disabled && !target__none(&opts->target)) {
2733 trigger_off(&auxtrace_snapshot_trigger);
2734 evlist__disable(rec->evlist);
/* ---- teardown ---- */
2739 trigger_off(&auxtrace_snapshot_trigger);
2740 trigger_off(&switch_output_trigger);
2742 if (opts->auxtrace_snapshot_on_exit)
2743 record__auxtrace_snapshot_exit(rec);
2745 if (forks && workload_exec_errno) {
2746 char msg[STRERR_BUFSIZE], strevsels[2048];
2747 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2749 evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2751 pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2752 strevsels, argv[0], emsg);
2758 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2759 record__waking(rec));
2761 write_finished_init(rec, true);
2763 if (target__none(&rec->opts.target))
2764 record__synthesize_workload(rec, true);
2767 record__stop_threads(rec);
2768 record__mmap_read_all(rec, true);
2770 record__free_thread_data(rec);
2771 evlist__finalize_ctlfd(rec->evlist);
2772 record__aio_mmap_read_sync(rec);
2774 if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2775 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2776 session->header.env.comp_ratio = ratio + 0.5;
2782 if (!child_finished)
2783 kill(rec->evlist->workload.pid, SIGTERM);
2789 else if (WIFEXITED(exit_status))
2790 status = WEXITSTATUS(exit_status);
2791 else if (WIFSIGNALED(exit_status))
2792 signr = WTERMSIG(exit_status);
2797 rec->bytes_written += off_cpu_write(rec->session);
2799 record__read_lost_samples(rec);
2800 record__synthesize(rec, true);
2801 /* this will be recalculated during process_buildids() */
2805 if (!rec->timestamp_filename) {
2806 record__finish_output(rec);
2808 fd = record__switch_output(rec, true);
2811 goto out_delete_session;
2816 perf_hooks__invoke_record_end();
2818 if (!err && !quiet) {
2820 const char *postfix = rec->timestamp_filename ?
2821 ".<timestamp>" : "";
2823 if (rec->samples && !rec->opts.full_auxtrace)
2824 scnprintf(samples, sizeof(samples),
2825 " (%" PRIu64 " samples)", rec->samples);
2829 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
2830 perf_data__size(data) / 1024.0 / 1024.0,
2831 data->path, postfix, samples);
2833 fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2834 rec->session->bytes_transferred / 1024.0 / 1024.0,
2837 fprintf(stderr, " ]\n");
2841 #ifdef HAVE_EVENTFD_SUPPORT
2849 zstd_fini(&session->zstd_data);
2850 perf_session__delete(session);
2852 if (!opts->no_bpf_event)
2853 evlist__stop_sb_thread(rec->sb_evlist);
2857 static void callchain_debug(struct callchain_param *callchain)
2859 static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2861 pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2863 if (callchain->record_mode == CALLCHAIN_DWARF)
2864 pr_debug("callchain: stack dump size %d\n",
2865 callchain->dump_size);
2868 int record_opts__parse_callchain(struct record_opts *record,
2869 struct callchain_param *callchain,
2870 const char *arg, bool unset)
2873 callchain->enabled = !unset;
2875 /* --no-call-graph */
2877 callchain->record_mode = CALLCHAIN_NONE;
2878 pr_debug("callchain: disabled\n");
2882 ret = parse_callchain_record_opt(arg, callchain);
2884 /* Enable data address sampling for DWARF unwind. */
2885 if (callchain->record_mode == CALLCHAIN_DWARF)
2886 record->sample_address = true;
2887 callchain_debug(callchain);
2893 int record_parse_callchain_opt(const struct option *opt,
2897 return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2900 int record_callchain_opt(const struct option *opt,
2901 const char *arg __maybe_unused,
2902 int unset __maybe_unused)
2904 struct callchain_param *callchain = opt->value;
2906 callchain->enabled = true;
2908 if (callchain->record_mode == CALLCHAIN_NONE)
2909 callchain->record_mode = CALLCHAIN_FP;
2911 callchain_debug(callchain);
2915 static int perf_record_config(const char *var, const char *value, void *cb)
2917 struct record *rec = cb;
2919 if (!strcmp(var, "record.build-id")) {
2920 if (!strcmp(value, "cache"))
2921 rec->no_buildid_cache = false;
2922 else if (!strcmp(value, "no-cache"))
2923 rec->no_buildid_cache = true;
2924 else if (!strcmp(value, "skip"))
2925 rec->no_buildid = true;
2926 else if (!strcmp(value, "mmap"))
2927 rec->buildid_mmap = true;
2932 if (!strcmp(var, "record.call-graph")) {
2933 var = "call-graph.record-mode";
2934 return perf_default_config(var, value, cb);
2936 #ifdef HAVE_AIO_SUPPORT
2937 if (!strcmp(var, "record.aio")) {
2938 rec->opts.nr_cblocks = strtol(value, NULL, 0);
2939 if (!rec->opts.nr_cblocks)
2940 rec->opts.nr_cblocks = nr_cblocks_default;
2943 if (!strcmp(var, "record.debuginfod")) {
2944 rec->debuginfod.urls = strdup(value);
2945 if (!rec->debuginfod.urls)
2947 rec->debuginfod.set = true;
2953 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2955 struct record *rec = (struct record *)opt->value;
2957 return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2960 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2962 struct record_opts *opts = (struct record_opts *)opt->value;
2967 if (!strcasecmp(str, "node"))
2968 opts->affinity = PERF_AFFINITY_NODE;
2969 else if (!strcasecmp(str, "cpu"))
2970 opts->affinity = PERF_AFFINITY_CPU;
2975 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2977 mask->nbits = nr_bits;
2978 mask->bits = bitmap_zalloc(mask->nbits);
2985 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2987 bitmap_free(mask->bits);
2991 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2995 ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2997 mask->affinity.bits = NULL;
3001 ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3003 record__mmap_cpu_mask_free(&mask->maps);
3004 mask->maps.bits = NULL;
3010 static void record__thread_mask_free(struct thread_mask *mask)
3012 record__mmap_cpu_mask_free(&mask->maps);
3013 record__mmap_cpu_mask_free(&mask->affinity);
3016 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3019 struct record_opts *opts = opt->value;
3021 if (unset || !str || !strlen(str)) {
3022 opts->threads_spec = THREAD_SPEC__CPU;
3024 for (s = 1; s < THREAD_SPEC__MAX; s++) {
3025 if (s == THREAD_SPEC__USER) {
3026 opts->threads_user_spec = strdup(str);
3027 if (!opts->threads_user_spec)
3029 opts->threads_spec = THREAD_SPEC__USER;
3032 if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3033 opts->threads_spec = s;
3039 if (opts->threads_spec == THREAD_SPEC__USER)
3040 pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3042 pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3047 static int parse_output_max_size(const struct option *opt,
3048 const char *str, int unset)
3050 unsigned long *s = (unsigned long *)opt->value;
3051 static struct parse_tag tags_size[] = {
3052 { .tag = 'B', .mult = 1 },
3053 { .tag = 'K', .mult = 1 << 10 },
3054 { .tag = 'M', .mult = 1 << 20 },
3055 { .tag = 'G', .mult = 1 << 30 },
3065 val = parse_tag_value(str, tags_size);
3066 if (val != (unsigned long) -1) {
3074 static int record__parse_mmap_pages(const struct option *opt,
3076 int unset __maybe_unused)
3078 struct record_opts *opts = opt->value;
3080 unsigned int mmap_pages;
3095 ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3098 opts->mmap_pages = mmap_pages;
3106 ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3110 opts->auxtrace_mmap_pages = mmap_pages;
3117 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3121 static int parse_control_option(const struct option *opt,
3123 int unset __maybe_unused)
3125 struct record_opts *opts = opt->value;
3127 return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3130 static void switch_output_size_warn(struct record *rec)
3132 u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3133 struct switch_output *s = &rec->switch_output;
3137 if (s->size < wakeup_size) {
3140 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3141 pr_warning("WARNING: switch-output data size lower than "
3142 "wakeup kernel buffer size (%s) "
3143 "expect bigger perf.data sizes\n", buf);
/*
 * switch_output_setup() - configure output rotation from --switch-output.
 *
 * Accepted forms: "signal" (rotate on SIGUSR2), a size threshold with a
 * B/K/M/G suffix (tags_size), or a time threshold with an s/m/h/d suffix
 * (tags_time).  --switch-output-event implies signal mode.  Rotation is
 * refused in parallel streaming mode (record__threads_enabled()), and
 * enabling it also turns on timestamped output file names.
 *
 * NOTE(review): this listing is elided - braces, gotos and returns are
 * missing from view; code lines kept byte-identical.
 */
3147 static int switch_output_setup(struct record *rec)
3149 struct switch_output *s = &rec->switch_output;
/* Size suffix multipliers for the size-threshold form. */
3150 static struct parse_tag tags_size[] = {
3151 { .tag = 'B', .mult = 1 },
3152 { .tag = 'K', .mult = 1 << 10 },
3153 { .tag = 'M', .mult = 1 << 20 },
3154 { .tag = 'G', .mult = 1 << 30 },
/* Time suffix multipliers (seconds) for the time-threshold form. */
3157 static struct parse_tag tags_time[] = {
3158 { .tag = 's', .mult = 1 },
3159 { .tag = 'm', .mult = 60 },
3160 { .tag = 'h', .mult = 60*60 },
3161 { .tag = 'd', .mult = 60*60*24 },
3167 * If we're using --switch-output-events, then we imply its
3168 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3169 * thread to its parent.
3171 if (rec->switch_output_event_set) {
3172 if (record__threads_enabled(rec)) {
3173 pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
/* Plain --switch-output is likewise incompatible with parallel streaming. */
3182 if (record__threads_enabled(rec)) {
3183 pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3187 if (!strcmp(s->str, "signal")) {
3190 pr_debug("switch-output with SIGUSR2 signal\n");
/* Not "signal": try to parse the argument as a size threshold... */
3194 val = parse_tag_value(s->str, tags_size);
3195 if (val != (unsigned long) -1) {
3197 pr_debug("switch-output with %s size threshold\n", s->str);
/* ...then as a time threshold. */
3201 val = parse_tag_value(s->str, tags_time);
3202 if (val != (unsigned long) -1) {
3204 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
/* Rotating output implies timestamped file names. */
3212 rec->timestamp_filename = true;
3215 if (s->size && !rec->opts.no_buffering)
3216 switch_output_size_warn(rec);
/* Usage strings shown by parse_options() for 'perf record'. */
3221 static const char * const __record_usage[] = {
3222 "perf record [<options>] [<command>]",
3223 "perf record [<options>] -- <command> [<options>]",
/* Exported alias used by builtin-script and option-error reporting. */
3226 const char * const *record_usage = __record_usage;
3228 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3229 struct perf_sample *sample, struct machine *machine)
3232 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3233 * no need to add them twice.
3235 if (!(event->header.misc & PERF_RECORD_MISC_USER))
3237 return perf_event__process_mmap(tool, event, sample, machine);
3240 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3241 struct perf_sample *sample, struct machine *machine)
3244 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3245 * no need to add them twice.
3247 if (!(event->header.misc & PERF_RECORD_MISC_USER))
3250 return perf_event__process_mmap2(tool, event, sample, machine);
3253 static int process_timestamp_boundary(struct perf_tool *tool,
3254 union perf_event *event __maybe_unused,
3255 struct perf_sample *sample,
3256 struct machine *machine __maybe_unused)
3258 struct record *rec = container_of(tool, struct record, tool);
3260 set_timestamp_boundary(rec, sample->time);
3264 static int parse_record_synth_option(const struct option *opt,
3266 int unset __maybe_unused)
3268 struct record_opts *opts = opt->value;
3269 char *p = strdup(str);
3274 opts->synth = parse_synth_opt(p);
3277 if (opts->synth < 0) {
3278 pr_err("Invalid synth option: %s\n", str);
3285 * XXX Ideally would be local to cmd_record() and passed to a record__new
3286 * because we need to have access to it in record__exit, that is called
3287 * after cmd_record() exits, but since record_options need to be accessible to
3288 * builtin-script, leave it here.
3290 * At least we don't ouch it in all the other functions here directly.
3292 * Just say no to tons of global variables, sigh.
/*
 * The single global 'record' instance: default record_opts (sampled
 * timestamps on, UINT_MAX/ULLONG_MAX sentinels meaning "not set by user",
 * one synthesis thread, synthesize everything) plus the perf_tool
 * callbacks installed for event processing.
 * NOTE(review): listing is elided - not all initializer nesting is
 * visible; code lines kept byte-identical.
 */
3294 static struct record record = {
3296 .sample_time = true,
3297 .mmap_pages = UINT_MAX,
3298 .user_freq = UINT_MAX,
3299 .user_interval = ULLONG_MAX,
3303 .default_per_cpu = true,
3305 .mmap_flush = MMAP_FLUSH_DEFAULT,
3306 .nr_threads_synthesize = 1,
3309 .synth = PERF_SYNTH_ALL,
/*
 * perf_tool callbacks: build-id-aware mmap handlers, timestamp-boundary
 * tracking for itrace_start/aux, and ordered event delivery.
 */
3312 .sample = process_sample_event,
3313 .fork = perf_event__process_fork,
3314 .exit = perf_event__process_exit,
3315 .comm = perf_event__process_comm,
3316 .namespaces = perf_event__process_namespaces,
3317 .mmap = build_id__process_mmap,
3318 .mmap2 = build_id__process_mmap2,
3319 .itrace_start = process_timestamp_boundary,
3320 .aux = process_timestamp_boundary,
3321 .ordered_events = true,
/* Help text for --call-graph; default record mode is frame pointers. */
3325 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3326 "\n\t\t\t\tDefault: fp";
/* --dry-run: parse options, then exit without recording. */
3328 static bool dry_run;
3331 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3332 * with it and switch to use the library functions in perf_evlist that came
3333 * from builtin-record.c, i.e. use record_opts,
3334 * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
/*
 * Command-line option table for 'perf record' (exported below as
 * record_options so builtin-script can reuse it).  Data table only;
 * behavior lives in the referenced callbacks.
 * NOTE(review): listing is elided - some entries are missing their
 * trailing arguments/braces; code lines kept byte-identical.
 */
3337 static struct option __record_options[] = {
3338 OPT_CALLBACK('e', "event", &record.evlist, "event",
3339 "event selector. use 'perf list' to list available events",
3340 parse_events_option),
3341 OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3342 "event filter", parse_filter),
3343 OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3344 NULL, "don't record events from perf itself",
3346 OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3347 "record events on existing process id"),
3348 OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3349 "record events on existing thread id"),
3350 OPT_INTEGER('r', "realtime", &record.realtime_prio,
3351 "collect data with this RT SCHED_FIFO priority"),
3352 OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3353 "collect data without buffering"),
3354 OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3355 "collect raw sample records from all opened counters"),
3356 OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3357 "system-wide collection from all CPUs"),
3358 OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3359 "list of cpus to monitor"),
3360 OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3361 OPT_STRING('o', "output", &record.data.path, "file",
3362 "output file name"),
3363 OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3364 &record.opts.no_inherit_set,
3365 "child tasks do not inherit counters"),
3366 OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3367 "synthesize non-sample events at the end of output"),
3368 OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3369 OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3370 OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3371 "Fail if the specified frequency can't be used"),
3372 OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3373 "profile at this frequency",
3374 record__parse_freq),
3375 OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3376 "number of mmap data pages and AUX area tracing mmap pages",
3377 record__parse_mmap_pages),
3378 OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3379 "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3380 record__mmap_flush_parse),
3381 OPT_BOOLEAN(0, "group", &record.opts.group,
3382 "put the counters into a counter group"),
3383 OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3384 NULL, "enables call-graph recording" ,
3385 &record_callchain_opt),
3386 OPT_CALLBACK(0, "call-graph", &record.opts,
3387 "record_mode[,record_size]", record_callchain_help,
3388 &record_parse_callchain_opt),
3389 OPT_INCR('v', "verbose", &verbose,
3390 "be more verbose (show counter open errors, etc)"),
3391 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3392 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3393 "per thread counts"),
3394 OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3395 OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3396 "Record the sample physical addresses"),
3397 OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3398 "Record the sampled data address data page size"),
3399 OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3400 "Record the sampled code address (ip) page size"),
3401 OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3402 OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3403 "Record the sample identifier"),
3404 OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3405 &record.opts.sample_time_set,
3406 "Record the sample timestamps"),
3407 OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3408 "Record the sample period"),
3409 OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3411 OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3412 &record.no_buildid_cache_set,
3413 "do not update the buildid cache"),
3414 OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3415 &record.no_buildid_set,
3416 "do not collect buildids in perf.data"),
3417 OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3418 "monitor event in cgroup name only",
3420 OPT_CALLBACK('D', "delay", &record, "ms",
3421 "ms to wait before starting measurement after program start (-1: start with events disabled), "
3422 "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3423 record__parse_event_enable_time),
3424 OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3425 OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3428 OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3429 "branch any", "sample any taken branches",
3430 parse_branch_stack),
3432 OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3433 "branch filter mask", "branch stack filter modes",
3434 parse_branch_stack),
3435 OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3436 "sample by weight (on special events only)"),
3437 OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3438 "sample transaction flags (special events only)"),
3439 OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3440 "use per-thread mmaps"),
3441 OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3442 "sample selected machine registers on interrupt,"
3443 " use '-I?' to list register names", parse_intr_regs),
3444 OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3445 "sample selected machine registers on interrupt,"
3446 " use '--user-regs=?' to list register names", parse_user_regs),
3447 OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3448 "Record running/enabled time of read (:S) events"),
3449 OPT_CALLBACK('k', "clockid", &record.opts,
3450 "clockid", "clockid to use for events, see clock_gettime()",
3452 OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3453 "opts", "AUX area tracing Snapshot Mode", ""),
3454 OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3455 "opts", "sample AUX area", ""),
3456 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3457 "per thread proc mmap processing timeout in ms"),
3458 OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3459 "Record namespaces events"),
3460 OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3461 "Record cgroup events"),
3462 OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3463 &record.opts.record_switch_events_set,
3464 "Record context switch events"),
3465 OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3466 "Configure all used events to run in kernel space.",
3467 PARSE_OPT_EXCLUSIVE),
3468 OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3469 "Configure all used events to run in user space.",
3470 PARSE_OPT_EXCLUSIVE),
3471 OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3472 "collect kernel callchains"),
3473 OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3474 "collect user callchains"),
3475 OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
3476 "clang binary to use for compiling BPF scriptlets"),
3477 OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
3478 "options passed to clang when compiling BPF scriptlets"),
3479 OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3480 "file", "vmlinux pathname"),
3481 OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3482 "Record build-id of all DSOs regardless of hits"),
3483 OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3484 "Record build-id in map events"),
3485 OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3486 "append timestamp to output filename"),
3487 OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3488 "Record timestamp boundary (time of first/last samples)"),
3489 OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3490 &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3491 "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3493 OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
3494 "switch output event selector. use 'perf list' to list available events",
3495 parse_events_option_new_evlist),
3496 OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3497 "Limit number of switch output generated files"),
3498 OPT_BOOLEAN(0, "dry-run", &dry_run,
3499 "Parse options then exit"),
3500 #ifdef HAVE_AIO_SUPPORT
3501 OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3502 &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3505 OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3506 "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3507 record__parse_affinity),
3508 #ifdef HAVE_ZSTD_SUPPORT
3509 OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3510 "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3511 record__parse_comp_level),
3513 OPT_CALLBACK(0, "max-size", &record.output_max_size,
3514 "size", "Limit the maximum size of the output file", parse_output_max_size),
3515 OPT_UINTEGER(0, "num-thread-synthesize",
3516 &record.opts.nr_threads_synthesize,
3517 "number of threads to run for event synthesis"),
3519 OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3520 "libpfm4 event selector. use 'perf list' to list available events",
3521 parse_libpfm_events_option),
3523 OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3524 "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3525 "\t\t\t 'snapshot': AUX area tracing snapshot).\n"
3526 "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3527 "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3528 parse_control_option),
3529 OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3530 "Fine-tune event synthesis: default=all", parse_record_synth_option),
3531 OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3532 &record.debuginfod.set, "debuginfod urls",
3533 "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3535 OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3536 "write collected trace data into several data files using parallel threads",
3537 record__parse_threads),
3538 OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
/* Exported so builtin-script can reuse the same option table. */
3542 struct option *record_options = __record_options;
3544 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3546 struct perf_cpu cpu;
3549 if (cpu_map__is_dummy(cpus))
3552 perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3555 /* Return ENODEV is input cpu is greater than max cpu */
3556 if ((unsigned long)cpu.cpu > mask->nbits)
3558 set_bit(cpu.cpu, mask->bits);
3564 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3566 struct perf_cpu_map *cpus;
3568 cpus = perf_cpu_map__new(mask_spec);
3572 bitmap_zero(mask->bits, mask->nbits);
3573 if (record__mmap_cpu_mask_init(mask, cpus))
3576 perf_cpu_map__put(cpus);
3581 static void record__free_thread_masks(struct record *rec, int nr_threads)
3585 if (rec->thread_masks)
3586 for (t = 0; t < nr_threads; t++)
3587 record__thread_mask_free(&rec->thread_masks[t]);
3589 zfree(&rec->thread_masks);
3592 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3596 rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3597 if (!rec->thread_masks) {
3598 pr_err("Failed to allocate thread masks\n");
3602 for (t = 0; t < nr_threads; t++) {
3603 ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3605 pr_err("Failed to allocate thread masks[%d]\n", t);
3613 record__free_thread_masks(rec, nr_threads);
3618 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3620 int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3622 ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3626 rec->nr_threads = nr_cpus;
3627 pr_debug("nr_threads: %d\n", rec->nr_threads);
3629 for (t = 0; t < rec->nr_threads; t++) {
3630 set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3631 set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3633 pr_debug("thread_masks[%d]: ", t);
3634 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3635 pr_debug("thread_masks[%d]: ", t);
3636 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
/*
 * record__init_thread_masks_spec() - build per-thread maps/affinity masks
 * from parallel arrays of @nr_spec CPU-list strings.
 *
 * Each spec is parsed, ANDed with the set of CPUs being recorded
 * (@cpus_mask) - invalid CPUs are dropped but a resulting empty mask is an
 * error - and checked for intersection with all previously accepted masks
 * (accumulated in full_mask).  Accepted masks are appended to
 * rec->thread_masks (grown with realloc) and ownership of the thread_mask
 * storage moves into the array, after which a fresh scratch mask is
 * allocated for the next iteration.
 *
 * NOTE(review): elided listing - error gotos, closing braces and the final
 * return are not visible; code lines kept byte-identical.
 */
3643 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3644 const char **maps_spec, const char **affinity_spec,
3649 struct mmap_cpu_mask cpus_mask;
3650 struct thread_mask thread_mask, full_mask, *thread_masks;
/* cpus_mask = the CPUs actually being recorded; specs are clipped to it. */
3652 ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3654 pr_err("Failed to allocate CPUs mask\n");
3658 ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3660 pr_err("Failed to init cpu mask\n");
3661 goto out_free_cpu_mask;
/* full_mask accumulates all accepted masks for the overlap check. */
3664 ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3666 pr_err("Failed to allocate full mask\n");
3667 goto out_free_cpu_mask;
3670 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3672 pr_err("Failed to allocate thread mask\n");
3673 goto out_free_full_and_cpu_masks;
3676 for (s = 0; s < nr_spec; s++) {
3677 ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3679 pr_err("Failed to initialize maps thread mask\n");
3682 ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3684 pr_err("Failed to initialize affinity thread mask\n");
3688 /* ignore invalid CPUs but do not allow empty masks */
3689 if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3690 cpus_mask.bits, thread_mask.maps.nbits)) {
3691 pr_err("Empty maps mask: %s\n", maps_spec[s]);
3695 if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3696 cpus_mask.bits, thread_mask.affinity.nbits)) {
3697 pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3702 /* do not allow intersection with other masks (full_mask) */
3703 if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3704 thread_mask.maps.nbits)) {
3705 pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3709 if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3710 thread_mask.affinity.nbits)) {
3711 pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
/* Fold the accepted mask into full_mask and append it to the array. */
3716 bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3717 thread_mask.maps.bits, full_mask.maps.nbits);
3718 bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3719 thread_mask.affinity.bits, full_mask.maps.nbits);
3721 thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3722 if (!thread_masks) {
3723 pr_err("Failed to reallocate thread masks\n");
3727 rec->thread_masks = thread_masks;
3728 rec->thread_masks[t] = thread_mask;
3730 pr_debug("thread_masks[%d]: ", t);
3731 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3732 pr_debug("thread_masks[%d]: ", t);
3733 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
/* Ownership moved into the array: allocate a fresh scratch mask. */
3736 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3738 pr_err("Failed to allocate thread mask\n");
3739 goto out_free_full_and_cpu_masks;
3742 rec->nr_threads = t;
3743 pr_debug("nr_threads: %d\n", rec->nr_threads);
3744 if (!rec->nr_threads)
3748 record__thread_mask_free(&thread_mask);
3749 out_free_full_and_cpu_masks:
3750 record__thread_mask_free(&full_mask);
3752 record__mmap_cpu_mask_free(&cpus_mask);
3757 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3760 struct cpu_topology *topo;
3762 topo = cpu_topology__new();
3764 pr_err("Failed to allocate CPU topology\n");
3768 ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3769 topo->core_cpus_list, topo->core_cpus_lists);
3770 cpu_topology__delete(topo);
3775 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3778 struct cpu_topology *topo;
3780 topo = cpu_topology__new();
3782 pr_err("Failed to allocate CPU topology\n");
3786 ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3787 topo->package_cpus_list, topo->package_cpus_lists);
3788 cpu_topology__delete(topo);
3793 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3798 struct numa_topology *topo;
3800 topo = numa_topology__new();
3802 pr_err("Failed to allocate NUMA topology\n");
3806 spec = zalloc(topo->nr * sizeof(char *));
3808 pr_err("Failed to allocate NUMA spec\n");
3810 goto out_delete_topo;
3812 for (s = 0; s < topo->nr; s++)
3813 spec[s] = topo->nodes[s].cpus;
3815 ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3820 numa_topology__delete(topo);
/*
 * record__init_thread_user_masks() - THREAD_SPEC__USER mode.
 *
 * Parses rec->opts.threads_user_spec of the form
 *   <maps-cpus>/<affinity-cpus>:<maps-cpus>/<affinity-cpus>:...
 * into parallel maps_spec[]/affinity_spec[] string arrays (strtok_r on ':'
 * for the per-thread entries, then on '/' for the maps/affinity split; a
 * missing affinity half is an error), hands them to
 * record__init_thread_masks_spec(), and frees the duplicated strings.
 *
 * NOTE(review): elided listing - loop braces, error gotos and the return
 * are not visible; code lines kept byte-identical.
 */
3825 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3829 char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3830 char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
/* strtok_r(':') walks the per-thread entries of the user spec. */
3832 for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3833 spec = strtok_r(user_spec, ":", &spec_ptr);
3836 pr_debug2("threads_spec[%d]: %s\n", t, spec);
/* First '/'-separated half: the maps CPU mask. */
3837 mask = strtok_r(spec, "/", &mask_ptr);
3840 pr_debug2(" maps mask: %s\n", mask);
3841 tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3843 pr_err("Failed to reallocate maps spec\n");
3847 maps_spec = tmp_spec;
3848 maps_spec[nr_spec] = dup_mask = strdup(mask);
3849 if (!maps_spec[nr_spec]) {
3850 pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
/* Second half: the affinity mask; its absence is a spec error. */
3854 mask = strtok_r(NULL, "/", &mask_ptr);
3856 pr_err("Invalid thread maps or affinity specs\n");
3860 pr_debug2(" affinity mask: %s\n", mask);
3861 tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3863 pr_err("Failed to reallocate affinity spec\n");
3867 affinity_spec = tmp_spec;
3868 affinity_spec[nr_spec] = strdup(mask);
3869 if (!affinity_spec[nr_spec]) {
3870 pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3878 ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3879 (const char **)affinity_spec, nr_spec);
/* Release the duplicated spec strings and the arrays themselves. */
3883 for (s = 0; s < nr_spec; s++) {
3887 free(affinity_spec[s]);
3889 free(affinity_spec);
3895 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3899 ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3903 if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3906 rec->nr_threads = 1;
3911 static int record__init_thread_masks(struct record *rec)
3914 struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3916 if (!record__threads_enabled(rec))
3917 return record__init_thread_default_masks(rec, cpus);
3919 if (evlist__per_thread(rec->evlist)) {
3920 pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3924 switch (rec->opts.threads_spec) {
3925 case THREAD_SPEC__CPU:
3926 ret = record__init_thread_cpu_masks(rec, cpus);
3928 case THREAD_SPEC__CORE:
3929 ret = record__init_thread_core_masks(rec, cpus);
3931 case THREAD_SPEC__PACKAGE:
3932 ret = record__init_thread_package_masks(rec, cpus);
3934 case THREAD_SPEC__NUMA:
3935 ret = record__init_thread_numa_masks(rec, cpus);
3937 case THREAD_SPEC__USER:
3938 ret = record__init_thread_user_masks(rec, cpus);
3947 int cmd_record(int argc, const char **argv)
/*
 * Entry point for 'perf record': parse and validate options, resolve
 * the target (system-wide / pid / tid / cpu list), build the event
 * list and its cpu/thread maps, then hand off to __cmd_record() to do
 * the actual profiling session.
 *
 * NOTE(review): this excerpt elides many lines (braces, 'goto out'
 * error paths, some return statements); comments below describe only
 * what the visible statements establish.
 */
3950 struct record *rec = &record;
3951 char errbuf[BUFSIZ];
3953 setlocale(LC_ALL, "");
/* Without libbpf, gray out the BPF-related options at runtime. */
3955 #ifndef HAVE_LIBBPF_SUPPORT
3956 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
3957 set_nobuild('\0', "clang-path", true);
3958 set_nobuild('\0', "clang-opt", true);
/* BPF prologue needs both DWARF and libbpf; pick the missing one. */
3962 #ifndef HAVE_BPF_PROLOGUE
3963 # if !defined (HAVE_DWARF_SUPPORT)
3964 # define REASON "NO_DWARF=1"
3965 # elif !defined (HAVE_LIBBPF_SUPPORT)
3966 # define REASON "NO_LIBBPF=1"
3968 # define REASON "this architecture doesn't support BPF prologue"
3970 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
3971 set_nobuild('\0', "vmlinux", true);
/* --off-cpu requires BPF skeleton support built in. */
3976 #ifndef HAVE_BPF_SKEL
3977 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3978 set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3982 rec->opts.affinity = PERF_AFFINITY_SYS;
3984 rec->evlist = evlist__new();
3985 if (rec->evlist == NULL)
/* Apply perf config-file settings before command-line parsing. */
3988 err = perf_config(perf_record_config, rec);
3992 argc = parse_options(argc, argv, record_options, record_usage,
3993 PARSE_OPT_STOP_AT_NON_OPTION);
3995 perf_quiet_option();
3997 err = symbol__validate_sym_arguments();
4001 perf_debuginfod_setup(&record.debuginfod);
4003 /* Make system wide (-a) the default target. */
4004 if (!argc && target__none(&rec->opts.target))
4005 rec->opts.target.system_wide = true;
/* cgroup events only make sense when sampling the whole system. */
4007 if (nr_cgroups && !rec->opts.target.system_wide) {
4008 usage_with_options_msg(record_usage, record_options,
4009 "cgroup monitoring only available in system-wide mode");
/* --buildid-mmap: record build ids in mmap2 events instead of
 * post-processing; needs kernel support. */
4013 if (rec->buildid_mmap) {
4014 if (!perf_can_record_build_id()) {
4015 pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4019 pr_debug("Enabling build id in mmap2 events.\n");
4020 /* Enable mmap build id synthesizing. */
4021 symbol_conf.buildid_mmap2 = true;
4022 /* Enable perf_event_attr::build_id bit. */
4023 rec->opts.build_id = true;
4024 /* Disable build id cache. */
4025 rec->no_buildid = true;
4028 if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4029 pr_err("Kernel has no cgroup sampling support.\n");
/* --kcore implies recording text poke events; both need a
 * directory-style perf.data, as does parallel streaming. */
4034 if (rec->opts.kcore)
4035 rec->opts.text_poke = true;
4037 if (rec->opts.kcore || record__threads_enabled(rec))
4038 rec->data.is_dir = true;
/* Parallel streaming excludes --affinity and --aio. */
4040 if (record__threads_enabled(rec)) {
4041 if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4042 pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4045 if (record__aio_enabled(rec)) {
4046 pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4051 if (rec->opts.comp_level != 0) {
4052 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4053 rec->no_buildid = true;
4056 if (rec->opts.record_switch_events &&
4057 !perf_can_record_switch_events()) {
4058 ui__error("kernel does not support recording context switch events\n");
4059 parse_options_usage(record_usage, record_options, "switch-events", 0);
4064 if (switch_output_setup(rec)) {
4065 parse_options_usage(record_usage, record_options, "switch-output", 0);
/* Time-based --switch-output: arm an alarm for the first rotation. */
4070 if (rec->switch_output.time) {
4071 signal(SIGALRM, alarm_sig_handler);
4072 alarm(rec->switch_output.time);
4075 if (rec->switch_output.num_files) {
4076 rec->switch_output.filenames = calloc(sizeof(char *),
4077 rec->switch_output.num_files);
4078 if (!rec->switch_output.filenames) {
4084 if (rec->timestamp_filename && record__threads_enabled(rec)) {
4085 rec->timestamp_filename = false;
4086 pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4090 * Allow aliases to facilitate the lookup of symbols for address
4091 * filters. Refer to auxtrace_parse_filters().
4093 symbol_conf.allow_aliases = true;
4097 err = record__auxtrace_init(rec);
4104 err = bpf__setup_stdout(rec->evlist);
4106 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
4107 pr_err("ERROR: Setup BPF stdout failed: %s\n",
/* Build-id cache policy: explicit --no-buildid(-cache) wins;
 * otherwise --switch-output disables caching by default (see the
 * comment block below for the exact equivalence). */
4114 if (rec->no_buildid_cache || rec->no_buildid) {
4115 disable_buildid_cache();
4116 } else if (rec->switch_output.enabled) {
4118 * In 'perf record --switch-output', disable buildid
4119 * generation by default to reduce data file switching
4120 * overhead. Still generate buildid if they are required
4123 * perf record --switch-output --no-no-buildid \
4124 * --no-no-buildid-cache
4126 * Following code equals to:
4128 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4129 * (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4130 * disable_buildid_cache();
4132 bool disable = true;
4134 if (rec->no_buildid_set && !rec->no_buildid)
4136 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4139 rec->no_buildid = true;
4140 rec->no_buildid_cache = true;
4141 disable_buildid_cache();
4145 if (record.opts.overwrite)
4146 record.opts.tail_synthesize = true;
/* No events given on the command line: add the default event
 * (hybrid-aware when the PMU is hybrid). */
4148 if (rec->evlist->core.nr_entries == 0) {
4149 if (perf_pmu__has_hybrid()) {
4150 err = evlist__add_default_hybrid(rec->evlist,
4151 !record.opts.no_samples);
4153 err = __evlist__add_default(rec->evlist,
4154 !record.opts.no_samples);
4158 pr_err("Not enough memory for event selector list\n");
4163 if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4164 rec->opts.no_inherit = true;
4166 err = target__validate(&rec->opts.target);
4168 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4169 ui__warning("%s\n", errbuf);
4172 err = target__parse_uid(&rec->opts.target);
4174 int saved_errno = errno;
4176 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4177 ui__error("%s", errbuf);
4183 /* Enable ignoring missing threads when -u/-p option is defined. */
4184 rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4186 if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
4187 pr_err("failed to use cpu list %s\n",
4188 rec->opts.target.cpu_list);
4192 rec->opts.target.hybrid = perf_pmu__has_hybrid();
/* Frame-pointer callchains may need arch-specific leaf-frame help. */
4194 if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4195 arch__add_leaf_frame_record_opts(&rec->opts);
4198 if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4199 if (rec->opts.target.pid != NULL) {
4200 pr_err("Couldn't create thread/CPU maps: %s\n",
4201 errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4205 usage_with_options(record_usage, record_options);
4208 err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4213 * We take all buildids when the file contains
4214 * AUX area tracing data because we do not decode the
4215 * trace because it would take too long.
4217 if (rec->opts.full_auxtrace)
4218 rec->buildid_all = true;
4220 if (rec->opts.text_poke) {
4221 err = record__config_text_poke(rec->evlist);
4223 pr_err("record__config_text_poke failed, error %d\n", err);
4229 err = record__config_off_cpu(rec);
4231 pr_err("record__config_off_cpu failed, error %d\n", err);
4236 if (record_opts__config(&rec->opts)) {
4241 err = record__init_thread_masks(rec);
4243 pr_err("Failed to initialize parallel data streaming masks\n");
/* Clamp --aio and --compression-level to their supported maxima. */
4247 if (rec->opts.nr_cblocks > nr_cblocks_max)
4248 rec->opts.nr_cblocks = nr_cblocks_max;
4249 pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4251 pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4252 pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4254 if (rec->opts.comp_level > comp_level_max)
4255 rec->opts.comp_level = comp_level_max;
4256 pr_debug("comp level: %d\n", rec->opts.comp_level);
/* All options validated: run the actual record session. */
4258 err = __cmd_record(&record, argc, argv);
/* Teardown: evlist, auxtrace record state, thread masks, control fds. */
4260 evlist__delete(rec->evlist);
4262 auxtrace_record__free(rec->itr);
4264 record__free_thread_masks(rec, rec->nr_threads);
4265 rec->nr_threads = 0;
4266 evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4270 static void snapshot_sig_handler(int sig __maybe_unused)
/*
 * SIGUSR2 handler: fire an AUX-area snapshot and, when --switch-output
 * is signal-driven, also request an output file rotation.
 * NOTE(review): braces/blank lines are elided in this excerpt.
 */
4272 struct record *rec = &record;
4274 hit_auxtrace_snapshot_trigger(rec);
4276 if (switch_output_signal(rec))
4277 trigger_hit(&switch_output_trigger);
4280 static void alarm_sig_handler(int sig __maybe_unused)
/*
 * SIGALRM handler for time-based --switch-output (armed in cmd_record
 * via alarm()): request an output file rotation when the period fires.
 * NOTE(review): the closing brace falls outside this excerpt.
 */
4282 struct record *rec = &record;
4284 if (switch_output_time(rec))
4285 trigger_hit(&switch_output_trigger);