tools/perf/builtin-record.c (platform/kernel/linux-starfive.git)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/llvm-utils.h"
41 #include "util/bpf-loader.h"
42 #include "util/trigger.h"
43 #include "util/perf-hooks.h"
44 #include "util/cpu-set-sched.h"
45 #include "util/synthetic-events.h"
46 #include "util/time-utils.h"
47 #include "util/units.h"
48 #include "util/bpf-event.h"
49 #include "util/util.h"
50 #include "util/pfm.h"
51 #include "util/clockid.h"
52 #include "util/pmu-hybrid.h"
53 #include "util/evlist-hybrid.h"
54 #include "util/off_cpu.h"
55 #include "asm/bug.h"
56 #include "perf.h"
57 #include "cputopo.h"
58
59 #include <errno.h>
60 #include <inttypes.h>
61 #include <locale.h>
62 #include <poll.h>
63 #include <pthread.h>
64 #include <unistd.h>
65 #ifndef HAVE_GETTID
66 #include <syscall.h>
67 #endif
68 #include <sched.h>
69 #include <signal.h>
70 #ifdef HAVE_EVENTFD_SUPPORT
71 #include <sys/eventfd.h>
72 #endif
73 #include <sys/mman.h>
74 #include <sys/wait.h>
75 #include <sys/types.h>
76 #include <sys/stat.h>
77 #include <fcntl.h>
78 #include <linux/err.h>
79 #include <linux/string.h>
80 #include <linux/time64.h>
81 #include <linux/zalloc.h>
82 #include <linux/bitmap.h>
83 #include <sys/time.h>
84
85 struct switch_output {
86         bool             enabled;
87         bool             signal;
88         unsigned long    size;
89         unsigned long    time;
90         const char      *str;
91         bool             set;
92         char             **filenames;
93         int              num_files;
94         int              cur_file;
95 };
96
97 struct thread_mask {
98         struct mmap_cpu_mask    maps;
99         struct mmap_cpu_mask    affinity;
100 };
101
102 struct record_thread {
103         pid_t                   tid;
104         struct thread_mask      *mask;
105         struct {
106                 int             msg[2];
107                 int             ack[2];
108         } pipes;
109         struct fdarray          pollfd;
110         int                     ctlfd_pos;
111         int                     nr_mmaps;
112         struct mmap             **maps;
113         struct mmap             **overwrite_maps;
114         struct record           *rec;
115         unsigned long long      samples;
116         unsigned long           waking;
117         u64                     bytes_written;
118         u64                     bytes_transferred;
119         u64                     bytes_compressed;
120 };
121
122 static __thread struct record_thread *thread;
123
124 enum thread_msg {
125         THREAD_MSG__UNDEFINED = 0,
126         THREAD_MSG__READY,
127         THREAD_MSG__MAX,
128 };
129
130 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
131         "UNDEFINED", "READY"
132 };
133
134 enum thread_spec {
135         THREAD_SPEC__UNDEFINED = 0,
136         THREAD_SPEC__CPU,
137         THREAD_SPEC__CORE,
138         THREAD_SPEC__PACKAGE,
139         THREAD_SPEC__NUMA,
140         THREAD_SPEC__USER,
141         THREAD_SPEC__MAX,
142 };
143
144 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
145         "undefined", "cpu", "core", "package", "numa", "user"
146 };
147
148 struct pollfd_index_map {
149         int evlist_pollfd_index;
150         int thread_pollfd_index;
151 };
152
153 struct record {
154         struct perf_tool        tool;
155         struct record_opts      opts;
156         u64                     bytes_written;
157         struct perf_data        data;
158         struct auxtrace_record  *itr;
159         struct evlist   *evlist;
160         struct perf_session     *session;
161         struct evlist           *sb_evlist;
162         pthread_t               thread_id;
163         int                     realtime_prio;
164         bool                    switch_output_event_set;
165         bool                    no_buildid;
166         bool                    no_buildid_set;
167         bool                    no_buildid_cache;
168         bool                    no_buildid_cache_set;
169         bool                    buildid_all;
170         bool                    buildid_mmap;
171         bool                    timestamp_filename;
172         bool                    timestamp_boundary;
173         bool                    off_cpu;
174         struct switch_output    switch_output;
175         unsigned long long      samples;
176         unsigned long           output_max_size;        /* = 0: unlimited */
177         struct perf_debuginfod  debuginfod;
178         int                     nr_threads;
179         struct thread_mask      *thread_masks;
180         struct record_thread    *thread_data;
181         struct pollfd_index_map *index_map;
182         size_t                  index_map_sz;
183         size_t                  index_map_cnt;
184 };
185
186 static volatile int done;
187
188 static volatile int auxtrace_record__snapshot_started;
189 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
190 static DEFINE_TRIGGER(switch_output_trigger);
191
192 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
193         "SYS", "NODE", "CPU"
194 };
195
196 #ifndef HAVE_GETTID
197 static inline pid_t gettid(void)
198 {
199         return (pid_t)syscall(__NR_gettid);
200 }
201 #endif
202
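/*
 * Non-zero when parallel trace streaming (--threads) was requested on the
 * command line.
 */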
203 static int record__threads_enabled(struct record *rec)
204 {
205         return rec->opts.threads_spec;
206 }
207
208 static bool switch_output_signal(struct record *rec)
209 {
210         return rec->switch_output.signal &&
211                trigger_is_ready(&switch_output_trigger);
212 }
213
214 static bool switch_output_size(struct record *rec)
215 {
216         return rec->switch_output.size &&
217                trigger_is_ready(&switch_output_trigger) &&
218                (rec->bytes_written >= rec->switch_output.size);
219 }
220
221 static bool switch_output_time(struct record *rec)
222 {
223         return rec->switch_output.time &&
224                trigger_is_ready(&switch_output_trigger);
225 }
226
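/* Sum of the bytes written by the main thread and by all record threads. */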
227 static u64 record__bytes_written(struct record *rec)
228 {
229         int t;
230         u64 bytes_written = rec->bytes_written;
231         struct record_thread *thread_data = rec->thread_data;
232
233         for (t = 0; t < rec->nr_threads; t++)
234                 bytes_written += thread_data[t].bytes_written;
235
236         return bytes_written;
237 }
238
239 static bool record__output_max_size_exceeded(struct record *rec)
240 {
241         return rec->output_max_size &&
242                (record__bytes_written(rec) >= rec->output_max_size);
243 }
244
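/*
 * Write a block of trace data either to the per-mmap file (threaded mode) or
 * to the main perf.data file, account the bytes written and stop the session
 * once the --max-size limit is exceeded.
 */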
245 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
246                          void *bf, size_t size)
247 {
248         struct perf_data_file *file = &rec->session->data->file;
249
250         if (map && map->file)
251                 file = map->file;
252
253         if (perf_data_file__write(file, bf, size) < 0) {
254                 pr_err("failed to write perf data, error: %m\n");
255                 return -1;
256         }
257
258         if (map && map->file)
259                 thread->bytes_written += size;
260         else
261                 rec->bytes_written += size;
262
263         if (record__output_max_size_exceeded(rec) && !done) {
264                 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
265                                 " stopping session ]\n",
266                                 record__bytes_written(rec) >> 10);
267                 done = 1;
268         }
269
270         if (switch_output_size(rec))
271                 trigger_hit(&switch_output_trigger);
272
273         return 0;
274 }
275
276 static int record__aio_enabled(struct record *rec);
277 static int record__comp_enabled(struct record *rec);
278 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
279                             void *dst, size_t dst_size, void *src, size_t src_size);
280
281 #ifdef HAVE_AIO_SUPPORT
282 static int record__aio_write(struct aiocb *cblock, int trace_fd,
283                 void *buf, size_t size, off_t off)
284 {
285         int rc;
286
287         cblock->aio_fildes = trace_fd;
288         cblock->aio_buf    = buf;
289         cblock->aio_nbytes = size;
290         cblock->aio_offset = off;
291         cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
292
293         do {
294                 rc = aio_write(cblock);
295                 if (rc == 0) {
296                         break;
297                 } else if (errno != EAGAIN) {
298                         cblock->aio_fildes = -1;
299                         pr_err("failed to queue perf data, error: %m\n");
300                         break;
301                 }
302         } while (1);
303
304         return rc;
305 }
306
307 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
308 {
309         void *rem_buf;
310         off_t rem_off;
311         size_t rem_size;
312         int rc, aio_errno;
313         ssize_t aio_ret, written;
314
315         aio_errno = aio_error(cblock);
316         if (aio_errno == EINPROGRESS)
317                 return 0;
318
319         written = aio_ret = aio_return(cblock);
320         if (aio_ret < 0) {
321                 if (aio_errno != EINTR)
322                         pr_err("failed to write perf data, error: %m\n");
323                 written = 0;
324         }
325
326         rem_size = cblock->aio_nbytes - written;
327
328         if (rem_size == 0) {
329                 cblock->aio_fildes = -1;
330                 /*
331                  * md->refcount is incremented in record__aio_pushfn() for
332                  * every aio write request started in record__aio_push() so
333                  * decrement it because the request is now complete.
334                  */
335                 perf_mmap__put(&md->core);
336                 rc = 1;
337         } else {
338                 /*
339                  * The aio write request may need to be restarted with the
340                  * remainder if the kernel didn't write the whole chunk
341                  * at once.
342                  */
343                 rem_off = cblock->aio_offset + written;
344                 rem_buf = (void *)(cblock->aio_buf + written);
345                 record__aio_write(cblock, cblock->aio_fildes,
346                                 rem_buf, rem_size, rem_off);
347                 rc = 0;
348         }
349
350         return rc;
351 }
352
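/*
 * Reap completed aio writes for this mmap.  With sync_all == false return the
 * index of the first free control block, waiting if none is available yet;
 * with sync_all == true wait until every in-flight write has completed.
 */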
353 static int record__aio_sync(struct mmap *md, bool sync_all)
354 {
355         struct aiocb **aiocb = md->aio.aiocb;
356         struct aiocb *cblocks = md->aio.cblocks;
357         struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
358         int i, do_suspend;
359
360         do {
361                 do_suspend = 0;
362                 for (i = 0; i < md->aio.nr_cblocks; ++i) {
363                         if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
364                                 if (sync_all)
365                                         aiocb[i] = NULL;
366                                 else
367                                         return i;
368                         } else {
369                                 /*
370                                  * The started aio write is not complete yet,
371                                  * so it has to be waited on before the
372                                  * next allocation.
373                                  */
374                                 aiocb[i] = &cblocks[i];
375                                 do_suspend = 1;
376                         }
377                 }
378                 if (!do_suspend)
379                         return -1;
380
381                 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
382                         if (!(errno == EAGAIN || errno == EINTR))
383                                 pr_err("failed to sync perf data, error: %m\n");
384                 }
385         } while (1);
386 }
387
388 struct record_aio {
389         struct record   *rec;
390         void            *data;
391         size_t          size;
392 };
393
394 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
395 {
396         struct record_aio *aio = to;
397
398         /*
399          * map->core.base data pointed to by buf is copied into a free map->aio.data[]
400          * buffer to release space in the kernel buffer as fast as possible, calling
401          * perf_mmap__consume() from the perf_mmap__push() function.
402          *
403          * That lets the kernel proceed with storing more profiling data into
404          * the kernel buffer earlier than other per-cpu kernel buffers are handled.
405          *
406          * Copying can be done in two steps in case the chunk of profiling data
407          * crosses the upper bound of the kernel buffer. In this case we first move
408          * part of the data from map->start till the upper bound and then the remainder
409          * from the beginning of the kernel buffer till the end of the data chunk.
410          */
411
412         if (record__comp_enabled(aio->rec)) {
413                 size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
414                                      mmap__mmap_len(map) - aio->size,
415                                      buf, size);
416         } else {
417                 memcpy(aio->data + aio->size, buf, size);
418         }
419
420         if (!aio->size) {
421                 /*
422                  * Increment map->refcount to guard map->aio.data[] buffer
423                  * from premature deallocation, because the map object can be
424                  * released before the aio write request started on the
425                  * map->aio.data[] buffer completes.
426                  *
427                  * perf_mmap__put() is done at record__aio_complete()
428                  * after started aio request completion or at record__aio_push()
429                  * if the request failed to start.
430                  */
431                 perf_mmap__get(&map->core);
432         }
433
434         aio->size += size;
435
436         return size;
437 }
438
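/* Queue the contents of one mmap as an asynchronous write at offset *off. */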
439 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
440 {
441         int ret, idx;
442         int trace_fd = rec->session->data->file.fd;
443         struct record_aio aio = { .rec = rec, .size = 0 };
444
445         /*
446          * Call record__aio_sync() to wait till map->aio.data[] buffer
447          * becomes available after previous aio write operation.
448          */
449
450         idx = record__aio_sync(map, false);
451         aio.data = map->aio.data[idx];
452         ret = perf_mmap__push(map, &aio, record__aio_pushfn);
453         if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
454                 return ret;
455
456         rec->samples++;
457         ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
458         if (!ret) {
459                 *off += aio.size;
460                 rec->bytes_written += aio.size;
461                 if (switch_output_size(rec))
462                         trigger_hit(&switch_output_trigger);
463         } else {
464                 /*
465                  * Drop the map->refcount taken in record__aio_pushfn() if the
466                  * record__aio_write() operation failed to start; otherwise
467                  * map->refcount is dropped in record__aio_complete() after the
468                  * aio write operation finishes successfully.
469                  */
470                 perf_mmap__put(&map->core);
471         }
472
473         return ret;
474 }
475
476 static off_t record__aio_get_pos(int trace_fd)
477 {
478         return lseek(trace_fd, 0, SEEK_CUR);
479 }
480
481 static void record__aio_set_pos(int trace_fd, off_t pos)
482 {
483         lseek(trace_fd, pos, SEEK_SET);
484 }
485
486 static void record__aio_mmap_read_sync(struct record *rec)
487 {
488         int i;
489         struct evlist *evlist = rec->evlist;
490         struct mmap *maps = evlist->mmap;
491
492         if (!record__aio_enabled(rec))
493                 return;
494
495         for (i = 0; i < evlist->core.nr_mmaps; i++) {
496                 struct mmap *map = &maps[i];
497
498                 if (map->core.base)
499                         record__aio_sync(map, true);
500         }
501 }
502
503 static int nr_cblocks_default = 1;
504 static int nr_cblocks_max = 4;
505
506 static int record__aio_parse(const struct option *opt,
507                              const char *str,
508                              int unset)
509 {
510         struct record_opts *opts = (struct record_opts *)opt->value;
511
512         if (unset) {
513                 opts->nr_cblocks = 0;
514         } else {
515                 if (str)
516                         opts->nr_cblocks = strtol(str, NULL, 0);
517                 if (!opts->nr_cblocks)
518                         opts->nr_cblocks = nr_cblocks_default;
519         }
520
521         return 0;
522 }
523 #else /* HAVE_AIO_SUPPORT */
524 static int nr_cblocks_max = 0;
525
526 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
527                             off_t *off __maybe_unused)
528 {
529         return -1;
530 }
531
532 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
533 {
534         return -1;
535 }
536
537 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
538 {
539 }
540
541 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
542 {
543 }
544 #endif
545
546 static int record__aio_enabled(struct record *rec)
547 {
548         return rec->opts.nr_cblocks > 0;
549 }
550
551 #define MMAP_FLUSH_DEFAULT 1
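/*
 * Parse the --mmap-flush argument: a plain number of bytes or a value with a
 * B/K/M/G suffix, clamped to a quarter of the mmap buffer size.
 */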
552 static int record__mmap_flush_parse(const struct option *opt,
553                                     const char *str,
554                                     int unset)
555 {
556         int flush_max;
557         struct record_opts *opts = (struct record_opts *)opt->value;
558         static struct parse_tag tags[] = {
559                         { .tag  = 'B', .mult = 1       },
560                         { .tag  = 'K', .mult = 1 << 10 },
561                         { .tag  = 'M', .mult = 1 << 20 },
562                         { .tag  = 'G', .mult = 1 << 30 },
563                         { .tag  = 0 },
564         };
565
566         if (unset)
567                 return 0;
568
569         if (str) {
570                 opts->mmap_flush = parse_tag_value(str, tags);
571                 if (opts->mmap_flush == (int)-1)
572                         opts->mmap_flush = strtol(str, NULL, 0);
573         }
574
575         if (!opts->mmap_flush)
576                 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
577
578         flush_max = evlist__mmap_size(opts->mmap_pages);
579         flush_max /= 4;
580         if (opts->mmap_flush > flush_max)
581                 opts->mmap_flush = flush_max;
582
583         return 0;
584 }
585
586 #ifdef HAVE_ZSTD_SUPPORT
587 static unsigned int comp_level_default = 1;
588
589 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
590 {
591         struct record_opts *opts = opt->value;
592
593         if (unset) {
594                 opts->comp_level = 0;
595         } else {
596                 if (str)
597                         opts->comp_level = strtol(str, NULL, 0);
598                 if (!opts->comp_level)
599                         opts->comp_level = comp_level_default;
600         }
601
602         return 0;
603 }
604 #endif
605 static unsigned int comp_level_max = 22;
606
607 static int record__comp_enabled(struct record *rec)
608 {
609         return rec->opts.comp_level > 0;
610 }
611
612 static int process_synthesized_event(struct perf_tool *tool,
613                                      union perf_event *event,
614                                      struct perf_sample *sample __maybe_unused,
615                                      struct machine *machine __maybe_unused)
616 {
617         struct record *rec = container_of(tool, struct record, tool);
618         return record__write(rec, NULL, event, event->header.size);
619 }
620
621 static struct mutex synth_lock;
622
623 static int process_locked_synthesized_event(struct perf_tool *tool,
624                                      union perf_event *event,
625                                      struct perf_sample *sample __maybe_unused,
626                                      struct machine *machine __maybe_unused)
627 {
628         int ret;
629
630         mutex_lock(&synth_lock);
631         ret = process_synthesized_event(tool, event, sample, machine);
632         mutex_unlock(&synth_lock);
633         return ret;
634 }
635
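/* perf_mmap__push() callback: optionally compress the chunk, then write it out. */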
636 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
637 {
638         struct record *rec = to;
639
640         if (record__comp_enabled(rec)) {
641                 size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
642                 bf   = map->data;
643         }
644
645         thread->samples++;
646         return record__write(rec, map, bf, size);
647 }
648
649 static volatile int signr = -1;
650 static volatile int child_finished;
651 #ifdef HAVE_EVENTFD_SUPPORT
652 static volatile int done_fd = -1;
653 #endif
654
655 static void sig_handler(int sig)
656 {
657         if (sig == SIGCHLD)
658                 child_finished = 1;
659         else
660                 signr = sig;
661
662         done = 1;
663 #ifdef HAVE_EVENTFD_SUPPORT
664         if (done_fd >= 0) {
665                 u64 tmp = 1;
666                 int orig_errno = errno;
667
668                 /*
669                  * It is possible for this signal handler to run after done is
670                  * checked in the main loop, but before the perf counter fds are
671                  * polled. If this happens, the poll() will continue to wait
672                  * even though done is set, and will only break out if either
673                  * another signal is received, or the counters are ready for
674                  * read. To ensure the poll() doesn't sleep when done is set,
675                  * use an eventfd (done_fd) to wake up the poll().
676                  */
677                 if (write(done_fd, &tmp, sizeof(tmp)) < 0)
678                         pr_err("failed to signal wakeup fd, error: %m\n");
679
680                 errno = orig_errno;
681         }
682 #endif // HAVE_EVENTFD_SUPPORT
683 }
684
685 static void sigsegv_handler(int sig)
686 {
687         perf_hooks__recover();
688         sighandler_dump_stack(sig);
689 }
690
691 static void record__sig_exit(void)
692 {
693         if (signr == -1)
694                 return;
695
696         signal(signr, SIG_DFL);
697         raise(signr);
698 }
699
700 #ifdef HAVE_AUXTRACE_SUPPORT
701
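/*
 * Write out an AUX area trace chunk: the event header, one or two data
 * fragments and trailing padding, remembering the file offset in the
 * auxtrace index for single-file output.
 */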
702 static int record__process_auxtrace(struct perf_tool *tool,
703                                     struct mmap *map,
704                                     union perf_event *event, void *data1,
705                                     size_t len1, void *data2, size_t len2)
706 {
707         struct record *rec = container_of(tool, struct record, tool);
708         struct perf_data *data = &rec->data;
709         size_t padding;
710         u8 pad[8] = {0};
711
712         if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
713                 off_t file_offset;
714                 int fd = perf_data__fd(data);
715                 int err;
716
717                 file_offset = lseek(fd, 0, SEEK_CUR);
718                 if (file_offset == -1)
719                         return -1;
720                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
721                                                      event, file_offset);
722                 if (err)
723                         return err;
724         }
725
726         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
727         padding = (len1 + len2) & 7;
728         if (padding)
729                 padding = 8 - padding;
730
731         record__write(rec, map, event, event->header.size);
732         record__write(rec, map, data1, len1);
733         if (len2)
734                 record__write(rec, map, data2, len2);
735         record__write(rec, map, &pad, padding);
736
737         return 0;
738 }
739
740 static int record__auxtrace_mmap_read(struct record *rec,
741                                       struct mmap *map)
742 {
743         int ret;
744
745         ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
746                                   record__process_auxtrace);
747         if (ret < 0)
748                 return ret;
749
750         if (ret)
751                 rec->samples++;
752
753         return 0;
754 }
755
756 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
757                                                struct mmap *map)
758 {
759         int ret;
760
761         ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
762                                            record__process_auxtrace,
763                                            rec->opts.auxtrace_snapshot_size);
764         if (ret < 0)
765                 return ret;
766
767         if (ret)
768                 rec->samples++;
769
770         return 0;
771 }
772
773 static int record__auxtrace_read_snapshot_all(struct record *rec)
774 {
775         int i;
776         int rc = 0;
777
778         for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
779                 struct mmap *map = &rec->evlist->mmap[i];
780
781                 if (!map->auxtrace_mmap.base)
782                         continue;
783
784                 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
785                         rc = -1;
786                         goto out;
787                 }
788         }
789 out:
790         return rc;
791 }
792
793 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
794 {
795         pr_debug("Recording AUX area tracing snapshot\n");
796         if (record__auxtrace_read_snapshot_all(rec) < 0) {
797                 trigger_error(&auxtrace_snapshot_trigger);
798         } else {
799                 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
800                         trigger_error(&auxtrace_snapshot_trigger);
801                 else
802                         trigger_ready(&auxtrace_snapshot_trigger);
803         }
804 }
805
806 static int record__auxtrace_snapshot_exit(struct record *rec)
807 {
808         if (trigger_is_error(&auxtrace_snapshot_trigger))
809                 return 0;
810
811         if (!auxtrace_record__snapshot_started &&
812             auxtrace_record__snapshot_start(rec->itr))
813                 return -1;
814
815         record__read_auxtrace_snapshot(rec, true);
816         if (trigger_is_error(&auxtrace_snapshot_trigger))
817                 return -1;
818
819         return 0;
820 }
821
822 static int record__auxtrace_init(struct record *rec)
823 {
824         int err;
825
826         if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
827             && record__threads_enabled(rec)) {
828                 pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
829                 return -EINVAL;
830         }
831
832         if (!rec->itr) {
833                 rec->itr = auxtrace_record__init(rec->evlist, &err);
834                 if (err)
835                         return err;
836         }
837
838         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
839                                               rec->opts.auxtrace_snapshot_opts);
840         if (err)
841                 return err;
842
843         err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
844                                             rec->opts.auxtrace_sample_opts);
845         if (err)
846                 return err;
847
848         auxtrace_regroup_aux_output(rec->evlist);
849
850         return auxtrace_parse_filters(rec->evlist);
851 }
852
853 #else
854
855 static inline
856 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
857                                struct mmap *map __maybe_unused)
858 {
859         return 0;
860 }
861
862 static inline
863 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
864                                     bool on_exit __maybe_unused)
865 {
866 }
867
868 static inline
869 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
870 {
871         return 0;
872 }
873
874 static inline
875 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
876 {
877         return 0;
878 }
879
880 static int record__auxtrace_init(struct record *rec __maybe_unused)
881 {
882         return 0;
883 }
884
885 #endif
886
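/*
 * Text poke recording needs a dummy event on all CPUs with attr.text_poke and
 * attr.ksymbol set so that kernel text modifications are captured; add one
 * unless such an event is already configured.
 */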
887 static int record__config_text_poke(struct evlist *evlist)
888 {
889         struct evsel *evsel;
890
891         /* Nothing to do if text poke is already configured */
892         evlist__for_each_entry(evlist, evsel) {
893                 if (evsel->core.attr.text_poke)
894                         return 0;
895         }
896
897         evsel = evlist__add_dummy_on_all_cpus(evlist);
898         if (!evsel)
899                 return -ENOMEM;
900
901         evsel->core.attr.text_poke = 1;
902         evsel->core.attr.ksymbol = 1;
903         evsel->immediate = true;
904         evsel__set_sample_bit(evsel, TIME);
905
906         return 0;
907 }
908
909 static int record__config_off_cpu(struct record *rec)
910 {
911         return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
912 }
913
914 static bool record__kcore_readable(struct machine *machine)
915 {
916         char kcore[PATH_MAX];
917         int fd;
918
919         scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
920
921         fd = open(kcore, O_RDONLY);
922         if (fd < 0)
923                 return false;
924
925         close(fd);
926
927         return true;
928 }
929
930 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
931 {
932         char from_dir[PATH_MAX];
933         char kcore_dir[PATH_MAX];
934         int ret;
935
936         snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
937
938         ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
939         if (ret)
940                 return ret;
941
942         return kcore_copy(from_dir, kcore_dir);
943 }
944
945 static void record__thread_data_init_pipes(struct record_thread *thread_data)
946 {
947         thread_data->pipes.msg[0] = -1;
948         thread_data->pipes.msg[1] = -1;
949         thread_data->pipes.ack[0] = -1;
950         thread_data->pipes.ack[1] = -1;
951 }
952
953 static int record__thread_data_open_pipes(struct record_thread *thread_data)
954 {
955         if (pipe(thread_data->pipes.msg))
956                 return -EINVAL;
957
958         if (pipe(thread_data->pipes.ack)) {
959                 close(thread_data->pipes.msg[0]);
960                 thread_data->pipes.msg[0] = -1;
961                 close(thread_data->pipes.msg[1]);
962                 thread_data->pipes.msg[1] = -1;
963                 return -EINVAL;
964         }
965
966         pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
967                  thread_data->pipes.msg[0], thread_data->pipes.msg[1],
968                  thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
969
970         return 0;
971 }
972
973 static void record__thread_data_close_pipes(struct record_thread *thread_data)
974 {
975         if (thread_data->pipes.msg[0] != -1) {
976                 close(thread_data->pipes.msg[0]);
977                 thread_data->pipes.msg[0] = -1;
978         }
979         if (thread_data->pipes.msg[1] != -1) {
980                 close(thread_data->pipes.msg[1]);
981                 thread_data->pipes.msg[1] = -1;
982         }
983         if (thread_data->pipes.ack[0] != -1) {
984                 close(thread_data->pipes.ack[0]);
985                 thread_data->pipes.ack[0] = -1;
986         }
987         if (thread_data->pipes.ack[1] != -1) {
988                 close(thread_data->pipes.ack[1]);
989                 thread_data->pipes.ack[1] = -1;
990         }
991 }
992
993 static bool evlist__per_thread(struct evlist *evlist)
994 {
995         return cpu_map__is_dummy(evlist->core.user_requested_cpus);
996 }
997
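/*
 * Distribute the evlist mmaps among this record thread: in per-thread mode it
 * takes all of them, otherwise only those whose CPU is set in the thread's
 * maps mask.
 */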
998 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
999 {
1000         int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1001         struct mmap *mmap = evlist->mmap;
1002         struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1003         struct perf_cpu_map *cpus = evlist->core.all_cpus;
1004         bool per_thread = evlist__per_thread(evlist);
1005
1006         if (per_thread)
1007                 thread_data->nr_mmaps = nr_mmaps;
1008         else
1009                 thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1010                                                       thread_data->mask->maps.nbits);
1011         if (mmap) {
1012                 thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1013                 if (!thread_data->maps)
1014                         return -ENOMEM;
1015         }
1016         if (overwrite_mmap) {
1017                 thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1018                 if (!thread_data->overwrite_maps) {
1019                         zfree(&thread_data->maps);
1020                         return -ENOMEM;
1021                 }
1022         }
1023         pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1024                  thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1025
1026         for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1027                 if (per_thread ||
1028                     test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1029                         if (thread_data->maps) {
1030                                 thread_data->maps[tm] = &mmap[m];
1031                                 pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1032                                           thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1033                         }
1034                         if (thread_data->overwrite_maps) {
1035                                 thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1036                                 pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1037                                           thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1038                         }
1039                         tm++;
1040                 }
1041         }
1042
1043         return 0;
1044 }
1045
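/*
 * Build this thread's pollfd array by duplicating the evlist pollfd entries
 * that belong to the mmaps owned by the thread.
 */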
1046 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1047 {
1048         int f, tm, pos;
1049         struct mmap *map, *overwrite_map;
1050
1051         fdarray__init(&thread_data->pollfd, 64);
1052
1053         for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1054                 map = thread_data->maps ? thread_data->maps[tm] : NULL;
1055                 overwrite_map = thread_data->overwrite_maps ?
1056                                 thread_data->overwrite_maps[tm] : NULL;
1057
1058                 for (f = 0; f < evlist->core.pollfd.nr; f++) {
1059                         void *ptr = evlist->core.pollfd.priv[f].ptr;
1060
1061                         if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1062                                 pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1063                                                               &evlist->core.pollfd);
1064                                 if (pos < 0)
1065                                         return pos;
1066                                 pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1067                                          thread_data, pos, evlist->core.pollfd.entries[f].fd);
1068                         }
1069                 }
1070         }
1071
1072         return 0;
1073 }
1074
1075 static void record__free_thread_data(struct record *rec)
1076 {
1077         int t;
1078         struct record_thread *thread_data = rec->thread_data;
1079
1080         if (thread_data == NULL)
1081                 return;
1082
1083         for (t = 0; t < rec->nr_threads; t++) {
1084                 record__thread_data_close_pipes(&thread_data[t]);
1085                 zfree(&thread_data[t].maps);
1086                 zfree(&thread_data[t].overwrite_maps);
1087                 fdarray__exit(&thread_data[t].pollfd);
1088         }
1089
1090         zfree(&rec->thread_data);
1091 }
1092
1093 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1094                                                     int evlist_pollfd_index,
1095                                                     int thread_pollfd_index)
1096 {
1097         size_t x = rec->index_map_cnt;
1098
1099         if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1100                 return -ENOMEM;
1101         rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1102         rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1103         rec->index_map_cnt += 1;
1104         return 0;
1105 }
1106
1107 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1108                                                     struct evlist *evlist,
1109                                                     struct record_thread *thread_data)
1110 {
1111         struct pollfd *e_entries = evlist->core.pollfd.entries;
1112         struct pollfd *t_entries = thread_data->pollfd.entries;
1113         int err = 0;
1114         size_t i;
1115
1116         for (i = 0; i < rec->index_map_cnt; i++) {
1117                 int e_pos = rec->index_map[i].evlist_pollfd_index;
1118                 int t_pos = rec->index_map[i].thread_pollfd_index;
1119
1120                 if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1121                     e_entries[e_pos].events != t_entries[t_pos].events) {
1122                         pr_err("Thread and evlist pollfd index mismatch\n");
1123                         err = -EINVAL;
1124                         continue;
1125                 }
1126                 e_entries[e_pos].revents = t_entries[t_pos].revents;
1127         }
1128         return err;
1129 }
1130
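/*
 * Duplicate the non-perf-event descriptors (e.g. control fds) into the main
 * thread's pollfd array and record the evlist/thread index mapping so that
 * revents can be propagated back later.
 */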
1131 static int record__dup_non_perf_events(struct record *rec,
1132                                        struct evlist *evlist,
1133                                        struct record_thread *thread_data)
1134 {
1135         struct fdarray *fda = &evlist->core.pollfd;
1136         int i, ret;
1137
1138         for (i = 0; i < fda->nr; i++) {
1139                 if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1140                         continue;
1141                 ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1142                 if (ret < 0) {
1143                         pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1144                         return ret;
1145                 }
1146                 pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1147                           thread_data, ret, fda->entries[i].fd);
1148                 ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1149                 if (ret < 0) {
1150                         pr_err("Failed to map thread and evlist pollfd indexes\n");
1151                         return ret;
1152                 }
1153         }
1154         return 0;
1155 }
1156
1157 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1158 {
1159         int t, ret;
1160         struct record_thread *thread_data;
1161
1162         rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1163         if (!rec->thread_data) {
1164                 pr_err("Failed to allocate thread data\n");
1165                 return -ENOMEM;
1166         }
1167         thread_data = rec->thread_data;
1168
1169         for (t = 0; t < rec->nr_threads; t++)
1170                 record__thread_data_init_pipes(&thread_data[t]);
1171
1172         for (t = 0; t < rec->nr_threads; t++) {
1173                 thread_data[t].rec = rec;
1174                 thread_data[t].mask = &rec->thread_masks[t];
1175                 ret = record__thread_data_init_maps(&thread_data[t], evlist);
1176                 if (ret) {
1177                         pr_err("Failed to initialize thread[%d] maps\n", t);
1178                         goto out_free;
1179                 }
1180                 ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1181                 if (ret) {
1182                         pr_err("Failed to initialize thread[%d] pollfd\n", t);
1183                         goto out_free;
1184                 }
1185                 if (t) {
1186                         thread_data[t].tid = -1;
1187                         ret = record__thread_data_open_pipes(&thread_data[t]);
1188                         if (ret) {
1189                                 pr_err("Failed to open thread[%d] communication pipes\n", t);
1190                                 goto out_free;
1191                         }
1192                         ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1193                                            POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1194                         if (ret < 0) {
1195                                 pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1196                                 goto out_free;
1197                         }
1198                         thread_data[t].ctlfd_pos = ret;
1199                         pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1200                                  thread_data, thread_data[t].ctlfd_pos,
1201                                  thread_data[t].pipes.msg[0]);
1202                 } else {
1203                         thread_data[t].tid = gettid();
1204
1205                         ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1206                         if (ret < 0)
1207                                 goto out_free;
1208
1209                         thread_data[t].ctlfd_pos = -1; /* Not used */
1210                 }
1211         }
1212
1213         return 0;
1214
1215 out_free:
1216         record__free_thread_data(rec);
1217
1218         return ret;
1219 }
1220
1221 static int record__mmap_evlist(struct record *rec,
1222                                struct evlist *evlist)
1223 {
1224         int i, ret;
1225         struct record_opts *opts = &rec->opts;
1226         bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1227                                   opts->auxtrace_sample_mode;
1228         char msg[512];
1229
1230         if (opts->affinity != PERF_AFFINITY_SYS)
1231                 cpu__setup_cpunode_map();
1232
1233         if (evlist__mmap_ex(evlist, opts->mmap_pages,
1234                                  opts->auxtrace_mmap_pages,
1235                                  auxtrace_overwrite,
1236                                  opts->nr_cblocks, opts->affinity,
1237                                  opts->mmap_flush, opts->comp_level) < 0) {
1238                 if (errno == EPERM) {
1239                         pr_err("Permission error mapping pages.\n"
1240                                "Consider increasing "
1241                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
1242                                "or try again with a smaller value of -m/--mmap_pages.\n"
1243                                "(current value: %u,%u)\n",
1244                                opts->mmap_pages, opts->auxtrace_mmap_pages);
1245                         return -errno;
1246                 } else {
1247                         pr_err("failed to mmap with %d (%s)\n", errno,
1248                                 str_error_r(errno, msg, sizeof(msg)));
1249                         if (errno)
1250                                 return -errno;
1251                         else
1252                                 return -EINVAL;
1253                 }
1254         }
1255
1256         if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1257                 return -1;
1258
1259         ret = record__alloc_thread_data(rec, evlist);
1260         if (ret)
1261                 return ret;
1262
1263         if (record__threads_enabled(rec)) {
1264                 ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1265                 if (ret) {
1266                         pr_err("Failed to create data directory: %s\n", strerror(-ret));
1267                         return ret;
1268                 }
1269                 for (i = 0; i < evlist->core.nr_mmaps; i++) {
1270                         if (evlist->mmap)
1271                                 evlist->mmap[i].file = &rec->data.dir.files[i];
1272                         if (evlist->overwrite_mmap)
1273                                 evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1274                 }
1275         }
1276
1277         return 0;
1278 }
1279
1280 static int record__mmap(struct record *rec)
1281 {
1282         return record__mmap_evlist(rec, rec->evlist);
1283 }
1284
1285 static int record__open(struct record *rec)
1286 {
1287         char msg[BUFSIZ];
1288         struct evsel *pos;
1289         struct evlist *evlist = rec->evlist;
1290         struct perf_session *session = rec->session;
1291         struct record_opts *opts = &rec->opts;
1292         int rc = 0;
1293
1294         /*
1295          * For initial_delay, system wide or a hybrid system, we need to add a
1296          * dummy event so that PERF_RECORD_MMAP events arriving during the
1297          * initial delay or during event synthesis are still captured.
1298          */
1299         if (opts->initial_delay || target__has_cpu(&opts->target) ||
1300             perf_pmu__has_hybrid()) {
1301                 pos = evlist__get_tracking_event(evlist);
1302                 if (!evsel__is_dummy_event(pos)) {
1303                         /* Set up dummy event. */
1304                         if (evlist__add_dummy(evlist))
1305                                 return -ENOMEM;
1306                         pos = evlist__last(evlist);
1307                         evlist__set_tracking_event(evlist, pos);
1308                 }
1309
1310                 /*
1311                  * Enable the dummy event when the process is forked for
1312                  * initial_delay, immediately for system wide.
1313                  */
1314                 if (opts->initial_delay && !pos->immediate &&
1315                     !target__has_cpu(&opts->target))
1316                         pos->core.attr.enable_on_exec = 1;
1317                 else
1318                         pos->immediate = 1;
1319         }
1320
1321         evlist__config(evlist, opts, &callchain_param);
1322
1323         evlist__for_each_entry(evlist, pos) {
1324 try_again:
1325                 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1326                         if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
1327                                 if (verbose > 0)
1328                                         ui__warning("%s\n", msg);
1329                                 goto try_again;
1330                         }
1331                         if ((errno == EINVAL || errno == EBADF) &&
1332                             pos->core.leader != &pos->core &&
1333                             pos->weak_group) {
1334                                 pos = evlist__reset_weak_group(evlist, pos, true);
1335                                 goto try_again;
1336                         }
1337                         rc = -errno;
1338                         evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1339                         ui__error("%s\n", msg);
1340                         goto out;
1341                 }
1342
1343                 pos->supported = true;
1344         }
1345
1346         if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1347                 pr_warning(
1348 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1349 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1350 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1351 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1352 "Samples in kernel modules won't be resolved at all.\n\n"
1353 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1354 "even with a suitable vmlinux or kallsyms file.\n\n");
1355         }
1356
1357         if (evlist__apply_filters(evlist, &pos)) {
1358                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1359                         pos->filter, evsel__name(pos), errno,
1360                         str_error_r(errno, msg, sizeof(msg)));
1361                 rc = -1;
1362                 goto out;
1363         }
1364
1365         rc = record__mmap(rec);
1366         if (rc)
1367                 goto out;
1368
1369         session->evlist = evlist;
1370         perf_session__set_id_hdr_size(session);
1371 out:
1372         return rc;
1373 }
1374
1375 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1376 {
1377         if (rec->evlist->first_sample_time == 0)
1378                 rec->evlist->first_sample_time = sample_time;
1379
1380         if (sample_time)
1381                 rec->evlist->last_sample_time = sample_time;
1382 }
1383
1384 static int process_sample_event(struct perf_tool *tool,
1385                                 union perf_event *event,
1386                                 struct perf_sample *sample,
1387                                 struct evsel *evsel,
1388                                 struct machine *machine)
1389 {
1390         struct record *rec = container_of(tool, struct record, tool);
1391
1392         set_timestamp_boundary(rec, sample->time);
1393
1394         if (rec->buildid_all)
1395                 return 0;
1396
1397         rec->samples++;
1398         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1399 }
1400
1401 static int process_buildids(struct record *rec)
1402 {
1403         struct perf_session *session = rec->session;
1404
1405         if (perf_data__size(&rec->data) == 0)
1406                 return 0;
1407
1408         /*
1409          * During this process, it'll load kernel map and replace the
1410          * dso->long_name to a real pathname it found.  In this case
1411          * we prefer the vmlinux path like
1412          *   /lib/modules/3.16.4/build/vmlinux
1413          *
1414          * rather than build-id path (in debug directory).
1415          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1416          */
1417         symbol_conf.ignore_vmlinux_buildid = true;
1418
1419         /*
1420          * If --buildid-all is given, it marks all DSOs regardless of hits,
1421          * so there is no need to process samples. But if timestamp_boundary is
1422          * enabled, it still needs to walk all samples to get the timestamps of
1423          * the first/last samples.
1424          */
1425         if (rec->buildid_all && !rec->timestamp_boundary)
1426                 rec->tool.sample = NULL;
1427
1428         return perf_session__process_events(session);
1429 }
1430
1431 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1432 {
1433         int err;
1434         struct perf_tool *tool = data;
1435         /*
1436          * As for the guest kernel, when processing the record & report
1437          * subcommands we arrange the module mmaps prior to the guest kernel
1438          * mmap and trigger a preload of the dso, because default guest module
1439          * symbols are loaded from guest kallsyms instead of
1440          * /lib/modules/XXX/XXX. This is done to avoid missing symbols when the
1441          * first address is in a module instead of in the guest kernel.
1442          */
1443         err = perf_event__synthesize_modules(tool, process_synthesized_event,
1444                                              machine);
1445         if (err < 0)
1446                 pr_err("Couldn't record guest kernel [%d]'s reference"
1447                        " relocation symbol.\n", machine->pid);
1448
1449         /*
1450          * We use _stext for the guest kernel because the guest kernel's
1451          * /proc/kallsyms sometimes has no _text.
1452          */
1453         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1454                                                  machine);
1455         if (err < 0)
1456                 pr_err("Couldn't record guest kernel [%d]'s reference"
1457                        " relocation symbol.\n", machine->pid);
1458 }
1459
1460 static struct perf_event_header finished_round_event = {
1461         .size = sizeof(struct perf_event_header),
1462         .type = PERF_RECORD_FINISHED_ROUND,
1463 };
1464
1465 static struct perf_event_header finished_init_event = {
1466         .size = sizeof(struct perf_event_header),
1467         .type = PERF_RECORD_FINISHED_INIT,
1468 };
1469
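/*
 * With --affinity=node or --affinity=cpu, migrate the tool thread onto the
 * CPUs backing the mmap that is about to be read so the reads stay local to
 * the data.
 */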
1470 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1471 {
1472         if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1473             !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1474                           thread->mask->affinity.nbits)) {
1475                 bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1476                 bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1477                           map->affinity_mask.bits, thread->mask->affinity.nbits);
1478                 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1479                                         (cpu_set_t *)thread->mask->affinity.bits);
1480                 if (verbose == 2) {
1481                         pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1482                         mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1483                 }
1484         }
1485 }
1486
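/*
 * Compression callback: a call with increment == 0 initializes the
 * PERF_RECORD_COMPRESSED header, subsequent calls grow its size by each
 * compressed increment.
 */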
1487 static size_t process_comp_header(void *record, size_t increment)
1488 {
1489         struct perf_record_compressed *event = record;
1490         size_t size = sizeof(*event);
1491
1492         if (increment) {
1493                 event->header.size += increment;
1494                 return increment;
1495         }
1496
1497         event->header.type = PERF_RECORD_COMPRESSED;
1498         event->header.size = size;
1499
1500         return size;
1501 }
1502
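/*
 * Compress src into dst as PERF_RECORD_COMPRESSED records, using the per-mmap
 * zstd state in threaded mode, and account the transferred vs. compressed
 * byte counts.
 */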
1503 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1504                             void *dst, size_t dst_size, void *src, size_t src_size)
1505 {
1506         size_t compressed;
1507         size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1508         struct zstd_data *zstd_data = &session->zstd_data;
1509
1510         if (map && map->file)
1511                 zstd_data = &map->zstd_data;
1512
1513         compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1514                                                      max_record_size, process_comp_header);
1515
1516         if (map && map->file) {
1517                 thread->bytes_transferred += src_size;
1518                 thread->bytes_compressed  += compressed;
1519         } else {
1520                 session->bytes_transferred += src_size;
1521                 session->bytes_compressed  += compressed;
1522         }
1523
1524         return compressed;
1525 }
1526
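     /*
      * Drain this thread's mmaps (either the regular or the overwrite set):
      * push ring-buffer data to the output directly or via AIO, read the AUX
      * area buffers unless snapshot/sample mode is in effect and, when not in
      * threaded mode, emit a FINISHED_ROUND event if anything was written.
      */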
1527 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1528                                     bool overwrite, bool synch)
1529 {
1530         u64 bytes_written = rec->bytes_written;
1531         int i;
1532         int rc = 0;
1533         int nr_mmaps;
1534         struct mmap **maps;
1535         int trace_fd = rec->data.file.fd;
1536         off_t off = 0;
1537
1538         if (!evlist)
1539                 return 0;
1540
1541         nr_mmaps = thread->nr_mmaps;
1542         maps = overwrite ? thread->overwrite_maps : thread->maps;
1543
1544         if (!maps)
1545                 return 0;
1546
1547         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1548                 return 0;
1549
1550         if (record__aio_enabled(rec))
1551                 off = record__aio_get_pos(trace_fd);
1552
1553         for (i = 0; i < nr_mmaps; i++) {
1554                 u64 flush = 0;
1555                 struct mmap *map = maps[i];
1556
1557                 if (map->core.base) {
1558                         record__adjust_affinity(rec, map);
1559                         if (synch) {
1560                                 flush = map->core.flush;
1561                                 map->core.flush = 1;
1562                         }
1563                         if (!record__aio_enabled(rec)) {
1564                                 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1565                                         if (synch)
1566                                                 map->core.flush = flush;
1567                                         rc = -1;
1568                                         goto out;
1569                                 }
1570                         } else {
1571                                 if (record__aio_push(rec, map, &off) < 0) {
1572                                         record__aio_set_pos(trace_fd, off);
1573                                         if (synch)
1574                                                 map->core.flush = flush;
1575                                         rc = -1;
1576                                         goto out;
1577                                 }
1578                         }
1579                         if (synch)
1580                                 map->core.flush = flush;
1581                 }
1582
1583                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1584                     !rec->opts.auxtrace_sample_mode &&
1585                     record__auxtrace_mmap_read(rec, map) != 0) {
1586                         rc = -1;
1587                         goto out;
1588                 }
1589         }
1590
1591         if (record__aio_enabled(rec))
1592                 record__aio_set_pos(trace_fd, off);
1593
1594         /*
1595          * Mark the round finished in case we wrote
1596          * at least one event.
1597          *
1598          * No need for round events in directory mode,
1599          * because data in the per-CPU maps and files is
1600          * already sorted by the kernel.
1601          */
1602         if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1603                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1604
1605         if (overwrite)
1606                 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1607 out:
1608         return rc;
1609 }
1610
1611 static int record__mmap_read_all(struct record *rec, bool synch)
1612 {
1613         int err;
1614
1615         err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1616         if (err)
1617                 return err;
1618
1619         return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1620 }
1621
1622 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1623                                            void *arg __maybe_unused)
1624 {
1625         struct perf_mmap *map = fda->priv[fd].ptr;
1626
1627         if (map)
1628                 perf_mmap__put(map);
1629 }
1630
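     /*
      * Body of a parallel reader started for --threads: acknowledge startup
      * over the ack pipe, then repeatedly drain this thread's mmaps, polling
      * when no new samples arrived, until the main thread closes the message
      * pipe (seen as POLLHUP on the control fd). A final synchronizing read
      * flushes remaining data before the termination acknowledgement is sent.
      */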
1631 static void *record__thread(void *arg)
1632 {
1633         enum thread_msg msg = THREAD_MSG__READY;
1634         bool terminate = false;
1635         struct fdarray *pollfd;
1636         int err, ctlfd_pos;
1637
1638         thread = arg;
1639         thread->tid = gettid();
1640
1641         err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1642         if (err == -1)
1643                 pr_warning("threads[%d]: failed to notify on start: %s\n",
1644                            thread->tid, strerror(errno));
1645
1646         pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1647
1648         pollfd = &thread->pollfd;
1649         ctlfd_pos = thread->ctlfd_pos;
1650
1651         for (;;) {
1652                 unsigned long long hits = thread->samples;
1653
1654                 if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1655                         break;
1656
1657                 if (hits == thread->samples) {
1658
1659                         err = fdarray__poll(pollfd, -1);
1660                         /*
1661                          * Propagate the error only if there is one. Ignore a positive
1662                          * number of returned events and interrupt errors (EINTR).
1663                          */
1664                         if (err > 0 || (err < 0 && errno == EINTR))
1665                                 err = 0;
1666                         thread->waking++;
1667
1668                         if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1669                                             record__thread_munmap_filtered, NULL) == 0)
1670                                 break;
1671                 }
1672
1673                 if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1674                         terminate = true;
1675                         close(thread->pipes.msg[0]);
1676                         thread->pipes.msg[0] = -1;
1677                         pollfd->entries[ctlfd_pos].fd = -1;
1678                         pollfd->entries[ctlfd_pos].events = 0;
1679                 }
1680
1681                 pollfd->entries[ctlfd_pos].revents = 0;
1682         }
1683         record__mmap_read_all(thread->rec, true);
1684
1685         err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1686         if (err == -1)
1687                 pr_warning("threads[%d]: failed to notify on termination: %s\n",
1688                            thread->tid, strerror(errno));
1689
1690         return NULL;
1691 }
1692
1693 static void record__init_features(struct record *rec)
1694 {
1695         struct perf_session *session = rec->session;
1696         int feat;
1697
1698         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1699                 perf_header__set_feat(&session->header, feat);
1700
1701         if (rec->no_buildid)
1702                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1703
1704         if (!have_tracepoints(&rec->evlist->core.entries))
1705                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1706
1707         if (!rec->opts.branch_stack)
1708                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1709
1710         if (!rec->opts.full_auxtrace)
1711                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1712
1713         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1714                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1715
1716         if (!rec->opts.use_clockid)
1717                 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1718
1719         if (!record__threads_enabled(rec))
1720                 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1721
1722         if (!record__comp_enabled(rec))
1723                 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1724
1725         perf_header__clear_feat(&session->header, HEADER_STAT);
1726 }
1727
1728 static void
1729 record__finish_output(struct record *rec)
1730 {
1731         int i;
1732         struct perf_data *data = &rec->data;
1733         int fd = perf_data__fd(data);
1734
1735         if (data->is_pipe)
1736                 return;
1737
1738         rec->session->header.data_size += rec->bytes_written;
1739         data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1740         if (record__threads_enabled(rec)) {
1741                 for (i = 0; i < data->dir.nr; i++)
1742                         data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1743         }
1744
1745         if (!rec->no_buildid) {
1746                 process_buildids(rec);
1747
1748                 if (rec->buildid_all)
1749                         dsos__hit_all(rec->session);
1750         }
1751         perf_session__write_header(rec->session, rec->evlist, fd, true);
1752
1753         return;
1754 }
1755
1756 static int record__synthesize_workload(struct record *rec, bool tail)
1757 {
1758         int err;
1759         struct perf_thread_map *thread_map;
1760         bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1761
1762         if (rec->opts.tail_synthesize != tail)
1763                 return 0;
1764
1765         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1766         if (thread_map == NULL)
1767                 return -1;
1768
1769         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1770                                                  process_synthesized_event,
1771                                                  &rec->session->machines.host,
1772                                                  needs_mmap,
1773                                                  rec->opts.sample_address);
1774         perf_thread_map__put(thread_map);
1775         return err;
1776 }
1777
1778 static int write_finished_init(struct record *rec, bool tail)
1779 {
1780         if (rec->opts.tail_synthesize != tail)
1781                 return 0;
1782
1783         return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1784 }
1785
1786 static int record__synthesize(struct record *rec, bool tail);
1787
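     /*
      * Rotate the output for --switch-output: flush pending AIO writes,
      * synthesize the tail events, finalize the current file and switch to
      * a new <perf.data>.<timestamp> file via perf_data__switch(). When
      * switch_output.num_files caps the number of kept files, the oldest
      * rotated file is removed; unless we are exiting, tracking events are
      * re-synthesized for the new file.
      */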
1788 static int
1789 record__switch_output(struct record *rec, bool at_exit)
1790 {
1791         struct perf_data *data = &rec->data;
1792         int fd, err;
1793         char *new_filename;
1794
1795         /* Same size as: "2015122520103046" */
1796         char timestamp[] = "InvalidTimestamp";
1797
1798         record__aio_mmap_read_sync(rec);
1799
1800         write_finished_init(rec, true);
1801
1802         record__synthesize(rec, true);
1803         if (target__none(&rec->opts.target))
1804                 record__synthesize_workload(rec, true);
1805
1806         rec->samples = 0;
1807         record__finish_output(rec);
1808         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1809         if (err) {
1810                 pr_err("Failed to get current timestamp\n");
1811                 return -EINVAL;
1812         }
1813
1814         fd = perf_data__switch(data, timestamp,
1815                                     rec->session->header.data_offset,
1816                                     at_exit, &new_filename);
1817         if (fd >= 0 && !at_exit) {
1818                 rec->bytes_written = 0;
1819                 rec->session->header.data_size = 0;
1820         }
1821
1822         if (!quiet)
1823                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1824                         data->path, timestamp);
1825
1826         if (rec->switch_output.num_files) {
1827                 int n = rec->switch_output.cur_file + 1;
1828
1829                 if (n >= rec->switch_output.num_files)
1830                         n = 0;
1831                 rec->switch_output.cur_file = n;
1832                 if (rec->switch_output.filenames[n]) {
1833                         remove(rec->switch_output.filenames[n]);
1834                         zfree(&rec->switch_output.filenames[n]);
1835                 }
1836                 rec->switch_output.filenames[n] = new_filename;
1837         } else {
1838                 free(new_filename);
1839         }
1840
1841         /* Output tracking events */
1842         if (!at_exit) {
1843                 record__synthesize(rec, false);
1844
1845                 /*
1846                  * In 'perf record --switch-output' without -a,
1847                  * record__synthesize() in record__switch_output() won't
1848                  * generate tracking events because there's no thread_map
1849                  * in evlist, so the newly created perf.data would not
1850                  * contain map and comm information.
1851                  * Create a fake thread_map and directly call
1852                  * perf_event__synthesize_thread_map() for those events.
1853                  */
1854                 if (target__none(&rec->opts.target))
1855                         record__synthesize_workload(rec, false);
1856                 write_finished_init(rec, false);
1857         }
1858         return fd;
1859 }
1860
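     /*
      * Read the lost-sample count of one (cpu, thread) counter instance and,
      * if it is non-zero, emit a synthesized PERF_RECORD_LOST_SAMPLES event
      * with an id sample appended that matches the evsel's sample_type, so
      * the record can later be attributed to the right event.
      */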
1861 static void __record__read_lost_samples(struct record *rec, struct evsel *evsel,
1862                                         struct perf_record_lost_samples *lost,
1863                                         int cpu_idx, int thread_idx)
1864 {
1865         struct perf_counts_values count;
1866         struct perf_sample_id *sid;
1867         struct perf_sample sample = {};
1868         int id_hdr_size;
1869
1870         if (perf_evsel__read(&evsel->core, cpu_idx, thread_idx, &count) < 0) {
1871                 pr_err("read LOST count failed\n");
1872                 return;
1873         }
1874
1875         if (count.lost == 0)
1876                 return;
1877
1878         lost->lost = count.lost;
1879         if (evsel->core.ids) {
1880                 sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1881                 sample.id = sid->id;
1882         }
1883
1884         id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1885                                                        evsel->core.attr.sample_type, &sample);
1886         lost->header.size = sizeof(*lost) + id_hdr_size;
1887         record__write(rec, NULL, lost, lost->header.size);
1888 }
1889
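     /*
      * Walk every evsel and every (cpu, thread) counter instance, reusing a
      * single preallocated buffer, and emit PERF_RECORD_LOST_SAMPLES records
      * for any samples the kernel reports as lost.
      */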
1890 static void record__read_lost_samples(struct record *rec)
1891 {
1892         struct perf_session *session = rec->session;
1893         struct perf_record_lost_samples *lost;
1894         struct evsel *evsel;
1895
1896         /* there was an error during record__open */
1897         if (session->evlist == NULL)
1898                 return;
1899
1900         lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1901         if (lost == NULL) {
1902                 pr_debug("Memory allocation failed\n");
1903                 return;
1904         }
1905
1906         lost->header.type = PERF_RECORD_LOST_SAMPLES;
1907
1908         evlist__for_each_entry(session->evlist, evsel) {
1909                 struct xyarray *xy = evsel->core.sample_id;
1910
1911                 if (xy == NULL || evsel->core.fd == NULL)
1912                         continue;
1913                 if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1914                     xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1915                         pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1916                         continue;
1917                 }
1918
1919                 for (int x = 0; x < xyarray__max_x(xy); x++) {
1920                         for (int y = 0; y < xyarray__max_y(xy); y++) {
1921                                 __record__read_lost_samples(rec, evsel, lost, x, y);
1922                         }
1923                 }
1924         }
1925         free(lost);
1926
1927 }
1928
1929 static volatile int workload_exec_errno;
1930
1931 /*
1932  * evlist__prepare_workload will send a SIGUSR1
1933  * if the fork fails, since we asked for it by setting
1934  * its want_signal to true.
1935  */
1936 static void workload_exec_failed_signal(int signo __maybe_unused,
1937                                         siginfo_t *info,
1938                                         void *ucontext __maybe_unused)
1939 {
1940         workload_exec_errno = info->si_value.sival_int;
1941         done = 1;
1942         child_finished = 1;
1943 }
1944
1945 static void snapshot_sig_handler(int sig);
1946 static void alarm_sig_handler(int sig);
1947
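     /*
      * Pick any mapped perf_event_mmap_page ("user page") so that time
      * conversion parameters can be read from it when synthesizing the
      * time conversion event in record__synthesize().
      */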
1948 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1949 {
1950         if (evlist) {
1951                 if (evlist->mmap && evlist->mmap[0].core.base)
1952                         return evlist->mmap[0].core.base;
1953                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1954                         return evlist->overwrite_mmap[0].core.base;
1955         }
1956         return NULL;
1957 }
1958
1959 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1960 {
1961         const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1962         if (pc)
1963                 return pc;
1964         return NULL;
1965 }
1966
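     /*
      * Synthesize the metadata events describing the system state at the
      * start (or, with --tail-synthesize, at the end) of the session: time
      * conversion, id index, auxtrace info, kernel and module mmaps, extra
      * attributes, thread and CPU maps, BPF and cgroup events and the
      * already running threads, optionally using multiple synthesis threads.
      */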
1967 static int record__synthesize(struct record *rec, bool tail)
1968 {
1969         struct perf_session *session = rec->session;
1970         struct machine *machine = &session->machines.host;
1971         struct perf_data *data = &rec->data;
1972         struct record_opts *opts = &rec->opts;
1973         struct perf_tool *tool = &rec->tool;
1974         int err = 0;
1975         event_op f = process_synthesized_event;
1976
1977         if (rec->opts.tail_synthesize != tail)
1978                 return 0;
1979
1980         if (data->is_pipe) {
1981                 err = perf_event__synthesize_for_pipe(tool, session, data,
1982                                                       process_synthesized_event);
1983                 if (err < 0)
1984                         goto out;
1985
1986                 rec->bytes_written += err;
1987         }
1988
1989         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1990                                           process_synthesized_event, machine);
1991         if (err)
1992                 goto out;
1993
1994         /* Synthesize id_index before auxtrace_info */
1995         err = perf_event__synthesize_id_index(tool,
1996                                               process_synthesized_event,
1997                                               session->evlist, machine);
1998         if (err)
1999                 goto out;
2000
2001         if (rec->opts.full_auxtrace) {
2002                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2003                                         session, process_synthesized_event);
2004                 if (err)
2005                         goto out;
2006         }
2007
2008         if (!evlist__exclude_kernel(rec->evlist)) {
2009                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2010                                                          machine);
2011                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2012                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2013                                    "Check /proc/kallsyms permission or run as root.\n");
2014
2015                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
2016                                                      machine);
2017                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2018                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2019                                    "Check /proc/modules permission or run as root.\n");
2020         }
2021
2022         if (perf_guest) {
2023                 machines__process_guests(&session->machines,
2024                                          perf_event__synthesize_guest_os, tool);
2025         }
2026
2027         err = perf_event__synthesize_extra_attr(&rec->tool,
2028                                                 rec->evlist,
2029                                                 process_synthesized_event,
2030                                                 data->is_pipe);
2031         if (err)
2032                 goto out;
2033
2034         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2035                                                  process_synthesized_event,
2036                                                 NULL);
2037         if (err < 0) {
2038                 pr_err("Couldn't synthesize thread map.\n");
2039                 return err;
2040         }
2041
2042         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2043                                              process_synthesized_event, NULL);
2044         if (err < 0) {
2045                 pr_err("Couldn't synthesize cpu map.\n");
2046                 return err;
2047         }
2048
2049         err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2050                                                 machine, opts);
2051         if (err < 0) {
2052                 pr_warning("Couldn't synthesize bpf events.\n");
2053                 err = 0;
2054         }
2055
2056         if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2057                 err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2058                                                      machine);
2059                 if (err < 0) {
2060                         pr_warning("Couldn't synthesize cgroup events.\n");
2061                         err = 0;
2062                 }
2063         }
2064
2065         if (rec->opts.nr_threads_synthesize > 1) {
2066                 mutex_init(&synth_lock);
2067                 perf_set_multithreaded();
2068                 f = process_locked_synthesized_event;
2069         }
2070
2071         if (rec->opts.synth & PERF_SYNTH_TASK) {
2072                 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2073
2074                 err = __machine__synthesize_threads(machine, tool, &opts->target,
2075                                                     rec->evlist->core.threads,
2076                                                     f, needs_mmap, opts->sample_address,
2077                                                     rec->opts.nr_threads_synthesize);
2078         }
2079
2080         if (rec->opts.nr_threads_synthesize > 1) {
2081                 perf_set_singlethreaded();
2082                 mutex_destroy(&synth_lock);
2083         }
2084
2085 out:
2086         return err;
2087 }
2088
2089 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2090 {
2091         struct record *rec = data;
2092         pthread_kill(rec->thread_id, SIGUSR2);
2093         return 0;
2094 }
2095
2096 static int record__setup_sb_evlist(struct record *rec)
2097 {
2098         struct record_opts *opts = &rec->opts;
2099
2100         if (rec->sb_evlist != NULL) {
2101                 /*
2102                  * We get here if --switch-output-event populated the
2103                  * sb_evlist, so associate a callback that will send a SIGUSR2
2104                  * to the main thread.
2105                  */
2106                 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2107                 rec->thread_id = pthread_self();
2108         }
2109 #ifdef HAVE_LIBBPF_SUPPORT
2110         if (!opts->no_bpf_event) {
2111                 if (rec->sb_evlist == NULL) {
2112                         rec->sb_evlist = evlist__new();
2113
2114                         if (rec->sb_evlist == NULL) {
2115                                 pr_err("Couldn't create side band evlist.\n");
2116                                 return -1;
2117                         }
2118                 }
2119
2120                 if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2121                         pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2122                         return -1;
2123                 }
2124         }
2125 #endif
2126         if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2127                 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2128                 opts->no_bpf_event = true;
2129         }
2130
2131         return 0;
2132 }
2133
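     /*
      * When --clockid is used, store the clock id (and its resolution, if
      * known) in the header env together with a pair of reference timestamps
      * taken back to back with gettimeofday() and clock_gettime(), so perf
      * timestamps can later be related to wall-clock (TOD) time.
      */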
2134 static int record__init_clock(struct record *rec)
2135 {
2136         struct perf_session *session = rec->session;
2137         struct timespec ref_clockid;
2138         struct timeval ref_tod;
2139         u64 ref;
2140
2141         if (!rec->opts.use_clockid)
2142                 return 0;
2143
2144         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2145                 session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2146
2147         session->header.env.clock.clockid = rec->opts.clockid;
2148
2149         if (gettimeofday(&ref_tod, NULL) != 0) {
2150                 pr_err("gettimeofday failed, cannot set reference time.\n");
2151                 return -1;
2152         }
2153
2154         if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2155                 pr_err("clock_gettime failed, cannot set reference time.\n");
2156                 return -1;
2157         }
2158
2159         ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2160               (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2161
2162         session->header.env.clock.tod_ns = ref;
2163
2164         ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2165               (u64) ref_clockid.tv_nsec;
2166
2167         session->header.env.clock.clockid_ns = ref;
2168         return 0;
2169 }
2170
2171 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2172 {
2173         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2174                 trigger_hit(&auxtrace_snapshot_trigger);
2175                 auxtrace_record__snapshot_started = 1;
2176                 if (auxtrace_record__snapshot_start(rec->itr))
2177                         trigger_error(&auxtrace_snapshot_trigger);
2178         }
2179 }
2180
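     /*
      * On hybrid systems, rewrite plain hybrid event names as
      * "<pmu_name>/<event>/" so the same event opened on different core
      * PMUs stays distinguishable; names already containing a '/' are
      * left untouched.
      */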
2181 static void record__uniquify_name(struct record *rec)
2182 {
2183         struct evsel *pos;
2184         struct evlist *evlist = rec->evlist;
2185         char *new_name;
2186         int ret;
2187
2188         if (!perf_pmu__has_hybrid())
2189                 return;
2190
2191         evlist__for_each_entry(evlist, pos) {
2192                 if (!evsel__is_hybrid(pos))
2193                         continue;
2194
2195                 if (strchr(pos->name, '/'))
2196                         continue;
2197
2198                 ret = asprintf(&new_name, "%s/%s/",
2199                                pos->pmu_name, pos->name);
2200                 if (ret > 0) {
2201                         free(pos->name);
2202                         pos->name = new_name;
2203                 }
2204         }
2205 }
2206
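     /*
      * Ask a reader thread to terminate by closing the write end of its
      * message pipe, which the thread observes as POLLHUP on its control
      * fd, then wait for its acknowledgement on the ack pipe.
      */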
2207 static int record__terminate_thread(struct record_thread *thread_data)
2208 {
2209         int err;
2210         enum thread_msg ack = THREAD_MSG__UNDEFINED;
2211         pid_t tid = thread_data->tid;
2212
2213         close(thread_data->pipes.msg[1]);
2214         thread_data->pipes.msg[1] = -1;
2215         err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2216         if (err > 0)
2217                 pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2218         else
2219                 pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2220                            thread->tid, tid);
2221
2222         return 0;
2223 }
2224
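     /*
      * Start the additional reader threads (thread_data[0] is served by the
      * main thread): block all signals while creating them, pin each to its
      * affinity mask where pthread_attr_setaffinity_np() is available, wait
      * for every start acknowledgement and finally pin the main thread to
      * its own mask as well.
      */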
2225 static int record__start_threads(struct record *rec)
2226 {
2227         int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2228         struct record_thread *thread_data = rec->thread_data;
2229         sigset_t full, mask;
2230         pthread_t handle;
2231         pthread_attr_t attrs;
2232
2233         thread = &thread_data[0];
2234
2235         if (!record__threads_enabled(rec))
2236                 return 0;
2237
2238         sigfillset(&full);
2239         if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2240                 pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2241                 return -1;
2242         }
2243
2244         pthread_attr_init(&attrs);
2245         pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2246
2247         for (t = 1; t < nr_threads; t++) {
2248                 enum thread_msg msg = THREAD_MSG__UNDEFINED;
2249
2250 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2251                 pthread_attr_setaffinity_np(&attrs,
2252                                             MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2253                                             (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2254 #endif
2255                 if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2256                         for (tt = 1; tt < t; tt++)
2257                                 record__terminate_thread(&thread_data[tt]);
2258                         pr_err("Failed to start threads: %s\n", strerror(errno));
2259                         ret = -1;
2260                         goto out_err;
2261                 }
2262
2263                 err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2264                 if (err > 0)
2265                         pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2266                                   thread_msg_tags[msg]);
2267                 else
2268                         pr_warning("threads[%d]: failed to receive start notification from %d\n",
2269                                    thread->tid, rec->thread_data[t].tid);
2270         }
2271
2272         sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2273                         (cpu_set_t *)thread->mask->affinity.bits);
2274
2275         pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2276
2277 out_err:
2278         pthread_attr_destroy(&attrs);
2279
2280         if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2281                 pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2282                 ret = -1;
2283         }
2284
2285         return ret;
2286 }
2287
2288 static int record__stop_threads(struct record *rec)
2289 {
2290         int t;
2291         struct record_thread *thread_data = rec->thread_data;
2292
2293         for (t = 1; t < rec->nr_threads; t++)
2294                 record__terminate_thread(&thread_data[t]);
2295
2296         for (t = 0; t < rec->nr_threads; t++) {
2297                 rec->samples += thread_data[t].samples;
2298                 if (!record__threads_enabled(rec))
2299                         continue;
2300                 rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2301                 rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2302                 pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2303                          thread_data[t].samples, thread_data[t].waking);
2304                 if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2305                         pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2306                                  thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2307                 else
2308                         pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2309         }
2310
2311         return 0;
2312 }
2313
2314 static unsigned long record__waking(struct record *rec)
2315 {
2316         int t;
2317         unsigned long waking = 0;
2318         struct record_thread *thread_data = rec->thread_data;
2319
2320         for (t = 0; t < rec->nr_threads; t++)
2321                 waking += thread_data[t].waking;
2322
2323         return waking;
2324 }
2325
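     /*
      * Main body of 'perf record': install signal handlers, create the
      * session and output header, synthesize the initial metadata, start the
      * workload and any reader threads, then loop draining the ring buffers
      * (handling aux snapshots, output switching and control-fd commands)
      * until recording stops, and finally emit lost-sample counts, the tail
      * events and the finished header.
      */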
2326 static int __cmd_record(struct record *rec, int argc, const char **argv)
2327 {
2328         int err;
2329         int status = 0;
2330         const bool forks = argc > 0;
2331         struct perf_tool *tool = &rec->tool;
2332         struct record_opts *opts = &rec->opts;
2333         struct perf_data *data = &rec->data;
2334         struct perf_session *session;
2335         bool disabled = false, draining = false;
2336         int fd;
2337         float ratio = 0;
2338         enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2339
2340         atexit(record__sig_exit);
2341         signal(SIGCHLD, sig_handler);
2342         signal(SIGINT, sig_handler);
2343         signal(SIGTERM, sig_handler);
2344         signal(SIGSEGV, sigsegv_handler);
2345
2346         if (rec->opts.record_namespaces)
2347                 tool->namespace_events = true;
2348
2349         if (rec->opts.record_cgroup) {
2350 #ifdef HAVE_FILE_HANDLE
2351                 tool->cgroup_events = true;
2352 #else
2353                 pr_err("cgroup tracking is not supported\n");
2354                 return -1;
2355 #endif
2356         }
2357
2358         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2359                 signal(SIGUSR2, snapshot_sig_handler);
2360                 if (rec->opts.auxtrace_snapshot_mode)
2361                         trigger_on(&auxtrace_snapshot_trigger);
2362                 if (rec->switch_output.enabled)
2363                         trigger_on(&switch_output_trigger);
2364         } else {
2365                 signal(SIGUSR2, SIG_IGN);
2366         }
2367
2368         session = perf_session__new(data, tool);
2369         if (IS_ERR(session)) {
2370                 pr_err("Perf session creation failed.\n");
2371                 return PTR_ERR(session);
2372         }
2373
2374         if (record__threads_enabled(rec)) {
2375                 if (perf_data__is_pipe(&rec->data)) {
2376                         pr_err("Parallel trace streaming is not available in pipe mode.\n");
2377                         return -1;
2378                 }
2379                 if (rec->opts.full_auxtrace) {
2380                         pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2381                         return -1;
2382                 }
2383         }
2384
2385         fd = perf_data__fd(data);
2386         rec->session = session;
2387
2388         if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2389                 pr_err("Compression initialization failed.\n");
2390                 return -1;
2391         }
2392 #ifdef HAVE_EVENTFD_SUPPORT
2393         done_fd = eventfd(0, EFD_NONBLOCK);
2394         if (done_fd < 0) {
2395                 pr_err("Failed to create wakeup eventfd, error: %m\n");
2396                 status = -1;
2397                 goto out_delete_session;
2398         }
2399         err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2400         if (err < 0) {
2401                 pr_err("Failed to add wakeup eventfd to poll list\n");
2402                 status = err;
2403                 goto out_delete_session;
2404         }
2405 #endif // HAVE_EVENTFD_SUPPORT
2406
2407         session->header.env.comp_type  = PERF_COMP_ZSTD;
2408         session->header.env.comp_level = rec->opts.comp_level;
2409
2410         if (rec->opts.kcore &&
2411             !record__kcore_readable(&session->machines.host)) {
2412                 pr_err("ERROR: kcore is not readable.\n");
2413                 return -1;
2414         }
2415
2416         if (record__init_clock(rec))
2417                 return -1;
2418
2419         record__init_features(rec);
2420
2421         if (forks) {
2422                 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2423                                                workload_exec_failed_signal);
2424                 if (err < 0) {
2425                         pr_err("Couldn't run the workload!\n");
2426                         status = err;
2427                         goto out_delete_session;
2428                 }
2429         }
2430
2431         /*
2432          * If we have just a single event and are sending data
2433          * through a pipe, we need to force ID allocation,
2434          * because we synthesize the event name through the pipe
2435          * and need the ID for that.
2436          */
2437         if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2438                 rec->opts.sample_id = true;
2439
2440         record__uniquify_name(rec);
2441
2442         /* Debug message used by test scripts */
2443         pr_debug3("perf record opening and mmapping events\n");
2444         if (record__open(rec) != 0) {
2445                 err = -1;
2446                 goto out_free_threads;
2447         }
2448         /* Debug message used by test scripts */
2449         pr_debug3("perf record done opening and mmapping events\n");
2450         session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2451
2452         if (rec->opts.kcore) {
2453                 err = record__kcore_copy(&session->machines.host, data);
2454                 if (err) {
2455                         pr_err("ERROR: Failed to copy kcore\n");
2456                         goto out_free_threads;
2457                 }
2458         }
2459
2460         err = bpf__apply_obj_config();
2461         if (err) {
2462                 char errbuf[BUFSIZ];
2463
2464                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2465                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2466                          errbuf);
2467                 goto out_free_threads;
2468         }
2469
2470         /*
2471          * Normally perf_session__new would do this, but it doesn't have the
2472          * evlist.
2473          */
2474         if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2475                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2476                 rec->tool.ordered_events = false;
2477         }
2478
2479         if (!rec->evlist->core.nr_groups)
2480                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2481
2482         if (data->is_pipe) {
2483                 err = perf_header__write_pipe(fd);
2484                 if (err < 0)
2485                         goto out_free_threads;
2486         } else {
2487                 err = perf_session__write_header(session, rec->evlist, fd, false);
2488                 if (err < 0)
2489                         goto out_free_threads;
2490         }
2491
2492         err = -1;
2493         if (!rec->no_buildid
2494             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2495                 pr_err("Couldn't generate buildids. "
2496                        "Use --no-buildid to profile anyway.\n");
2497                 goto out_free_threads;
2498         }
2499
2500         err = record__setup_sb_evlist(rec);
2501         if (err)
2502                 goto out_free_threads;
2503
2504         err = record__synthesize(rec, false);
2505         if (err < 0)
2506                 goto out_free_threads;
2507
2508         if (rec->realtime_prio) {
2509                 struct sched_param param;
2510
2511                 param.sched_priority = rec->realtime_prio;
2512                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2513                         pr_err("Could not set realtime priority.\n");
2514                         err = -1;
2515                         goto out_free_threads;
2516                 }
2517         }
2518
2519         if (record__start_threads(rec))
2520                 goto out_free_threads;
2521
2522         /*
2523          * When perf is starting the traced process, all the events
2524          * (apart from group members) have enable_on_exec=1 set,
2525          * so don't spoil it by prematurely enabling them.
2526          */
2527         if (!target__none(&opts->target) && !opts->initial_delay)
2528                 evlist__enable(rec->evlist);
2529
2530         /*
2531          * Let the child rip
2532          */
2533         if (forks) {
2534                 struct machine *machine = &session->machines.host;
2535                 union perf_event *event;
2536                 pid_t tgid;
2537
2538                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2539                 if (event == NULL) {
2540                         err = -ENOMEM;
2541                         goto out_child;
2542                 }
2543
2544                 /*
2545                  * Some H/W events are generated before the COMM event,
2546                  * which is emitted during exec(), so perf script
2547                  * cannot see a correct process name for those events.
2548                  * Synthesize a COMM event to prevent that.
2549                  */
2550                 tgid = perf_event__synthesize_comm(tool, event,
2551                                                    rec->evlist->workload.pid,
2552                                                    process_synthesized_event,
2553                                                    machine);
2554                 free(event);
2555
2556                 if (tgid == -1)
2557                         goto out_child;
2558
2559                 event = malloc(sizeof(event->namespaces) +
2560                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2561                                machine->id_hdr_size);
2562                 if (event == NULL) {
2563                         err = -ENOMEM;
2564                         goto out_child;
2565                 }
2566
2567                 /*
2568                  * Synthesize NAMESPACES event for the command specified.
2569                  */
2570                 perf_event__synthesize_namespaces(tool, event,
2571                                                   rec->evlist->workload.pid,
2572                                                   tgid, process_synthesized_event,
2573                                                   machine);
2574                 free(event);
2575
2576                 evlist__start_workload(rec->evlist);
2577         }
2578
2579         if (opts->initial_delay) {
2580                 pr_info(EVLIST_DISABLED_MSG);
2581                 if (opts->initial_delay > 0) {
2582                         usleep(opts->initial_delay * USEC_PER_MSEC);
2583                         evlist__enable(rec->evlist);
2584                         pr_info(EVLIST_ENABLED_MSG);
2585                 }
2586         }
2587
2588         err = event_enable_timer__start(rec->evlist->eet);
2589         if (err)
2590                 goto out_child;
2591
2592         /* Debug message used by test scripts */
2593         pr_debug3("perf record has started\n");
2594         fflush(stderr);
2595
2596         trigger_ready(&auxtrace_snapshot_trigger);
2597         trigger_ready(&switch_output_trigger);
2598         perf_hooks__invoke_record_start();
2599
2600         /*
2601          * Must write FINISHED_INIT so it will be seen after all other
2602          * synthesized user events, but before any regular events.
2603          */
2604         err = write_finished_init(rec, false);
2605         if (err < 0)
2606                 goto out_child;
2607
2608         for (;;) {
2609                 unsigned long long hits = thread->samples;
2610
2611                 /*
2612                  * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
2613                  * here: when done == true and hits != rec->samples
2614                  * in the previous round.
2615                  *
2616                  * evlist__toggle_bkw_mmap() ensures we never
2617                  * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2618                  */
2619                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
2620                         evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2621
2622                 if (record__mmap_read_all(rec, false) < 0) {
2623                         trigger_error(&auxtrace_snapshot_trigger);
2624                         trigger_error(&switch_output_trigger);
2625                         err = -1;
2626                         goto out_child;
2627                 }
2628
2629                 if (auxtrace_record__snapshot_started) {
2630                         auxtrace_record__snapshot_started = 0;
2631                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
2632                                 record__read_auxtrace_snapshot(rec, false);
2633                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2634                                 pr_err("AUX area tracing snapshot failed\n");
2635                                 err = -1;
2636                                 goto out_child;
2637                         }
2638                 }
2639
2640                 if (trigger_is_hit(&switch_output_trigger)) {
2641                         /*
2642                          * If switch_output_trigger is hit, the data in the
2643                          * overwritable ring buffer should have been collected,
2644                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2645                          *
2646                          * If SIGUSR2 is raised after or during record__mmap_read_all(),
2647                          * record__mmap_read_all() didn't collect data from the
2648                          * overwritable ring buffer. Read again.
2649                          */
2650                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2651                                 continue;
2652                         trigger_ready(&switch_output_trigger);
2653
2654                         /*
2655                          * Re-enable events in the overwrite ring buffer after
2656                          * record__mmap_read_all(): we should have collected
2657                          * all data from it by now.
2658                          */
2659                         evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2660
2661                         if (!quiet)
2662                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2663                                         record__waking(rec));
2664                         thread->waking = 0;
2665                         fd = record__switch_output(rec, false);
2666                         if (fd < 0) {
2667                                 pr_err("Failed to switch to new file\n");
2668                                 trigger_error(&switch_output_trigger);
2669                                 err = fd;
2670                                 goto out_child;
2671                         }
2672
2673                         /* re-arm the alarm */
2674                         if (rec->switch_output.time)
2675                                 alarm(rec->switch_output.time);
2676                 }
2677
2678                 if (hits == thread->samples) {
2679                         if (done || draining)
2680                                 break;
2681                         err = fdarray__poll(&thread->pollfd, -1);
2682                         /*
2683                          * Propagate the error only if there is one. Ignore a positive
2684                          * number of returned events and interrupt errors (EINTR).
2685                          */
2686                         if (err > 0 || (err < 0 && errno == EINTR))
2687                                 err = 0;
2688                         thread->waking++;
2689
2690                         if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2691                                             record__thread_munmap_filtered, NULL) == 0)
2692                                 draining = true;
2693
2694                         err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2695                         if (err)
2696                                 goto out_child;
2697                 }
2698
2699                 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2700                         switch (cmd) {
2701                         case EVLIST_CTL_CMD_SNAPSHOT:
2702                                 hit_auxtrace_snapshot_trigger(rec);
2703                                 evlist__ctlfd_ack(rec->evlist);
2704                                 break;
2705                         case EVLIST_CTL_CMD_STOP:
2706                                 done = 1;
2707                                 break;
2708                         case EVLIST_CTL_CMD_ACK:
2709                         case EVLIST_CTL_CMD_UNSUPPORTED:
2710                         case EVLIST_CTL_CMD_ENABLE:
2711                         case EVLIST_CTL_CMD_DISABLE:
2712                         case EVLIST_CTL_CMD_EVLIST:
2713                         case EVLIST_CTL_CMD_PING:
2714                         default:
2715                                 break;
2716                         }
2717                 }
2718
2719                 err = event_enable_timer__process(rec->evlist->eet);
2720                 if (err < 0)
2721                         goto out_child;
2722                 if (err) {
2723                         err = 0;
2724                         done = 1;
2725                 }
2726
2727                 /*
2728                  * When perf is starting the traced process, the events die with
2729                  * the process at the end and we wait for that. Thus there is no
2730                  * need to disable events in this case.
2731                  */
2732                 if (done && !disabled && !target__none(&opts->target)) {
2733                         trigger_off(&auxtrace_snapshot_trigger);
2734                         evlist__disable(rec->evlist);
2735                         disabled = true;
2736                 }
2737         }
2738
2739         trigger_off(&auxtrace_snapshot_trigger);
2740         trigger_off(&switch_output_trigger);
2741
2742         if (opts->auxtrace_snapshot_on_exit)
2743                 record__auxtrace_snapshot_exit(rec);
2744
2745         if (forks && workload_exec_errno) {
2746                 char msg[STRERR_BUFSIZE], strevsels[2048];
2747                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2748
2749                 evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2750
2751                 pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2752                         strevsels, argv[0], emsg);
2753                 err = -1;
2754                 goto out_child;
2755         }
2756
2757         if (!quiet)
2758                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2759                         record__waking(rec));
2760
2761         write_finished_init(rec, true);
2762
2763         if (target__none(&rec->opts.target))
2764                 record__synthesize_workload(rec, true);
2765
2766 out_child:
2767         record__stop_threads(rec);
2768         record__mmap_read_all(rec, true);
2769 out_free_threads:
2770         record__free_thread_data(rec);
2771         evlist__finalize_ctlfd(rec->evlist);
2772         record__aio_mmap_read_sync(rec);
2773
2774         if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2775                 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2776                 session->header.env.comp_ratio = ratio + 0.5;
2777         }
2778
2779         if (forks) {
2780                 int exit_status;
2781
2782                 if (!child_finished)
2783                         kill(rec->evlist->workload.pid, SIGTERM);
2784
2785                 wait(&exit_status);
2786
2787                 if (err < 0)
2788                         status = err;
2789                 else if (WIFEXITED(exit_status))
2790                         status = WEXITSTATUS(exit_status);
2791                 else if (WIFSIGNALED(exit_status))
2792                         signr = WTERMSIG(exit_status);
2793         } else
2794                 status = err;
2795
2796         if (rec->off_cpu)
2797                 rec->bytes_written += off_cpu_write(rec->session);
2798
2799         record__read_lost_samples(rec);
2800         record__synthesize(rec, true);
2801         /* this will be recalculated during process_buildids() */
2802         rec->samples = 0;
2803
2804         if (!err) {
2805                 if (!rec->timestamp_filename) {
2806                         record__finish_output(rec);
2807                 } else {
2808                         fd = record__switch_output(rec, true);
2809                         if (fd < 0) {
2810                                 status = fd;
2811                                 goto out_delete_session;
2812                         }
2813                 }
2814         }
2815
2816         perf_hooks__invoke_record_end();
2817
2818         if (!err && !quiet) {
2819                 char samples[128];
2820                 const char *postfix = rec->timestamp_filename ?
2821                                         ".<timestamp>" : "";
2822
2823                 if (rec->samples && !rec->opts.full_auxtrace)
2824                         scnprintf(samples, sizeof(samples),
2825                                   " (%" PRIu64 " samples)", rec->samples);
2826                 else
2827                         samples[0] = '\0';
2828
2829                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
2830                         perf_data__size(data) / 1024.0 / 1024.0,
2831                         data->path, postfix, samples);
2832                 if (ratio) {
2833                         fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2834                                         rec->session->bytes_transferred / 1024.0 / 1024.0,
2835                                         ratio);
2836                 }
2837                 fprintf(stderr, " ]\n");
2838         }
2839
2840 out_delete_session:
2841 #ifdef HAVE_EVENTFD_SUPPORT
2842         if (done_fd >= 0) {
2843                 fd = done_fd;
2844                 done_fd = -1;
2845
2846                 close(fd);
2847         }
2848 #endif
2849         zstd_fini(&session->zstd_data);
2850         perf_session__delete(session);
2851
2852         if (!opts->no_bpf_event)
2853                 evlist__stop_sb_thread(rec->sb_evlist);
2854         return status;
2855 }
2856
2857 static void callchain_debug(struct callchain_param *callchain)
2858 {
2859         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2860
2861         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2862
2863         if (callchain->record_mode == CALLCHAIN_DWARF)
2864                 pr_debug("callchain: stack dump size %d\n",
2865                          callchain->dump_size);
2866 }
2867
2868 int record_opts__parse_callchain(struct record_opts *record,
2869                                  struct callchain_param *callchain,
2870                                  const char *arg, bool unset)
2871 {
2872         int ret;
2873         callchain->enabled = !unset;
2874
2875         /* --no-call-graph */
2876         if (unset) {
2877                 callchain->record_mode = CALLCHAIN_NONE;
2878                 pr_debug("callchain: disabled\n");
2879                 return 0;
2880         }
2881
2882         ret = parse_callchain_record_opt(arg, callchain);
2883         if (!ret) {
2884                 /* Enable data address sampling for DWARF unwind. */
2885                 if (callchain->record_mode == CALLCHAIN_DWARF)
2886                         record->sample_address = true;
2887                 callchain_debug(callchain);
2888         }
2889
2890         return ret;
2891 }
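/*
 * Illustrative usage of the parsing above (the workload name is just a
 * placeholder):
 *
 *   perf record --call-graph fp -- ./workload
 *   perf record --call-graph dwarf,8192 -- ./workload
 *
 * The second form selects DWARF unwinding with an 8192 byte stack dump and,
 * as handled above, also enables data address sampling.
 */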
2892
2893 int record_parse_callchain_opt(const struct option *opt,
2894                                const char *arg,
2895                                int unset)
2896 {
2897         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2898 }
2899
2900 int record_callchain_opt(const struct option *opt,
2901                          const char *arg __maybe_unused,
2902                          int unset __maybe_unused)
2903 {
2904         struct callchain_param *callchain = opt->value;
2905
2906         callchain->enabled = true;
2907
2908         if (callchain->record_mode == CALLCHAIN_NONE)
2909                 callchain->record_mode = CALLCHAIN_FP;
2910
2911         callchain_debug(callchain);
2912         return 0;
2913 }
2914
2915 static int perf_record_config(const char *var, const char *value, void *cb)
2916 {
2917         struct record *rec = cb;
2918
2919         if (!strcmp(var, "record.build-id")) {
2920                 if (!strcmp(value, "cache"))
2921                         rec->no_buildid_cache = false;
2922                 else if (!strcmp(value, "no-cache"))
2923                         rec->no_buildid_cache = true;
2924                 else if (!strcmp(value, "skip"))
2925                         rec->no_buildid = true;
2926                 else if (!strcmp(value, "mmap"))
2927                         rec->buildid_mmap = true;
2928                 else
2929                         return -1;
2930                 return 0;
2931         }
2932         if (!strcmp(var, "record.call-graph")) {
2933                 var = "call-graph.record-mode";
2934                 return perf_default_config(var, value, cb);
2935         }
2936 #ifdef HAVE_AIO_SUPPORT
2937         if (!strcmp(var, "record.aio")) {
2938                 rec->opts.nr_cblocks = strtol(value, NULL, 0);
2939                 if (!rec->opts.nr_cblocks)
2940                         rec->opts.nr_cblocks = nr_cblocks_default;
2941         }
2942 #endif
2943         if (!strcmp(var, "record.debuginfod")) {
2944                 rec->debuginfod.urls = strdup(value);
2945                 if (!rec->debuginfod.urls)
2946                         return -ENOMEM;
2947                 rec->debuginfod.set = true;
2948         }
2949
2950         return 0;
2951 }
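/*
 * Sketch of a perfconfig fragment matching the keys handled above (values
 * are examples only; the debuginfod URL is a placeholder):
 *
 *   [record]
 *           build-id = cache        # or: no-cache, skip, mmap
 *           call-graph = dwarf      # forwarded as call-graph.record-mode
 *           aio = 4                 # only honoured with HAVE_AIO_SUPPORT
 *           debuginfod = https://debuginfod.example.org
 */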
2952
2953 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2954 {
2955         struct record *rec = (struct record *)opt->value;
2956
2957         return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2958 }
2959
2960 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2961 {
2962         struct record_opts *opts = (struct record_opts *)opt->value;
2963
2964         if (unset || !str)
2965                 return 0;
2966
2967         if (!strcasecmp(str, "node"))
2968                 opts->affinity = PERF_AFFINITY_NODE;
2969         else if (!strcasecmp(str, "cpu"))
2970                 opts->affinity = PERF_AFFINITY_CPU;
2971
2972         return 0;
2973 }
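/*
 * Examples accepted by the parser above:
 *
 *   perf record --affinity=node ...   # PERF_AFFINITY_NODE
 *   perf record --affinity=cpu ...    # PERF_AFFINITY_CPU
 *
 * Any other value silently keeps the default set in cmd_record()
 * (PERF_AFFINITY_SYS).
 */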
2974
2975 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2976 {
2977         mask->nbits = nr_bits;
2978         mask->bits = bitmap_zalloc(mask->nbits);
2979         if (!mask->bits)
2980                 return -ENOMEM;
2981
2982         return 0;
2983 }
2984
2985 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2986 {
2987         bitmap_free(mask->bits);
2988         mask->nbits = 0;
2989 }
2990
2991 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2992 {
2993         int ret;
2994
2995         ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2996         if (ret) {
2997                 mask->affinity.bits = NULL;
2998                 return ret;
2999         }
3000
3001         ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3002         if (ret) {
3003                 record__mmap_cpu_mask_free(&mask->maps);
3004                 mask->maps.bits = NULL;
3005         }
3006
3007         return ret;
3008 }
3009
3010 static void record__thread_mask_free(struct thread_mask *mask)
3011 {
3012         record__mmap_cpu_mask_free(&mask->maps);
3013         record__mmap_cpu_mask_free(&mask->affinity);
3014 }
3015
3016 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3017 {
3018         int s;
3019         struct record_opts *opts = opt->value;
3020
3021         if (unset || !str || !strlen(str)) {
3022                 opts->threads_spec = THREAD_SPEC__CPU;
3023         } else {
3024                 for (s = 1; s < THREAD_SPEC__MAX; s++) {
3025                         if (s == THREAD_SPEC__USER) {
3026                                 opts->threads_user_spec = strdup(str);
3027                                 if (!opts->threads_user_spec)
3028                                         return -ENOMEM;
3029                                 opts->threads_spec = THREAD_SPEC__USER;
3030                                 break;
3031                         }
3032                         if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3033                                 opts->threads_spec = s;
3034                                 break;
3035                         }
3036                 }
3037         }
3038
3039         if (opts->threads_spec == THREAD_SPEC__USER)
3040                 pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3041         else
3042                 pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3043
3044         return 0;
3045 }
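/*
 * Illustrative --threads values handled above: an empty or missing value
 * falls back to THREAD_SPEC__CPU, a tag such as core/package/numa selects the
 * corresponding topology based grouping, and anything else is kept as a user
 * spec string (see record__init_thread_user_masks() below):
 *
 *   perf record --threads ...
 *   perf record --threads=numa ...
 *   perf record --threads=0-3/0:4-7/4 ...
 */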
3046
3047 static int parse_output_max_size(const struct option *opt,
3048                                  const char *str, int unset)
3049 {
3050         unsigned long *s = (unsigned long *)opt->value;
3051         static struct parse_tag tags_size[] = {
3052                 { .tag  = 'B', .mult = 1       },
3053                 { .tag  = 'K', .mult = 1 << 10 },
3054                 { .tag  = 'M', .mult = 1 << 20 },
3055                 { .tag  = 'G', .mult = 1 << 30 },
3056                 { .tag  = 0 },
3057         };
3058         unsigned long val;
3059
3060         if (unset) {
3061                 *s = 0;
3062                 return 0;
3063         }
3064
3065         val = parse_tag_value(str, tags_size);
3066         if (val != (unsigned long) -1) {
3067                 *s = val;
3068                 return 0;
3069         }
3070
3071         return -1;
3072 }
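/*
 * Example: "--max-size=200M" stores 200 << 20 in record.output_max_size via
 * the B/K/M/G tags above; a value without one of those suffixes matches no
 * tag, parse_tag_value() returns (unsigned long)-1 and the option fails.
 */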
3073
3074 static int record__parse_mmap_pages(const struct option *opt,
3075                                     const char *str,
3076                                     int unset __maybe_unused)
3077 {
3078         struct record_opts *opts = opt->value;
3079         char *s, *p;
3080         unsigned int mmap_pages;
3081         int ret;
3082
3083         if (!str)
3084                 return -EINVAL;
3085
3086         s = strdup(str);
3087         if (!s)
3088                 return -ENOMEM;
3089
3090         p = strchr(s, ',');
3091         if (p)
3092                 *p = '\0';
3093
3094         if (*s) {
3095                 ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3096                 if (ret)
3097                         goto out_free;
3098                 opts->mmap_pages = mmap_pages;
3099         }
3100
3101         if (!p) {
3102                 ret = 0;
3103                 goto out_free;
3104         }
3105
3106         ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3107         if (ret)
3108                 goto out_free;
3109
3110         opts->auxtrace_mmap_pages = mmap_pages;
3111
3112 out_free:
3113         free(s);
3114         return ret;
3115 }
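/*
 * Example: "-m 512,128" is split at the comma above, giving 512 data mmap
 * pages and 128 AUX area mmap pages; "-m 512" leaves the AUX size untouched.
 * __evlist__parse_mmap_pages() also understands size suffixes such as "16M".
 */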
3116
3117 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3118 {
3119 }
3120
3121 static int parse_control_option(const struct option *opt,
3122                                 const char *str,
3123                                 int unset __maybe_unused)
3124 {
3125         struct record_opts *opts = opt->value;
3126
3127         return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3128 }
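/*
 * Example control channel setups parsed above, matching the --control help
 * text further down in this file (the fifo paths are placeholders):
 *
 *   perf record --control fd:10,11 ...
 *   perf record --control fifo:/tmp/perf.ctl,/tmp/perf.ack ...
 *
 * Commands such as 'enable' and 'disable' are then written to the control
 * end at run time.
 */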
3129
3130 static void switch_output_size_warn(struct record *rec)
3131 {
3132         u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3133         struct switch_output *s = &rec->switch_output;
3134
3135         wakeup_size /= 2;
3136
3137         if (s->size < wakeup_size) {
3138                 char buf[100];
3139
3140                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3141                 pr_warning("WARNING: switch-output data size is lower than "
3142                            "the wakeup kernel buffer size (%s), "
3143                            "expect bigger perf.data sizes\n", buf);
3144         }
3145 }
3146
3147 static int switch_output_setup(struct record *rec)
3148 {
3149         struct switch_output *s = &rec->switch_output;
3150         static struct parse_tag tags_size[] = {
3151                 { .tag  = 'B', .mult = 1       },
3152                 { .tag  = 'K', .mult = 1 << 10 },
3153                 { .tag  = 'M', .mult = 1 << 20 },
3154                 { .tag  = 'G', .mult = 1 << 30 },
3155                 { .tag  = 0 },
3156         };
3157         static struct parse_tag tags_time[] = {
3158                 { .tag  = 's', .mult = 1        },
3159                 { .tag  = 'm', .mult = 60       },
3160                 { .tag  = 'h', .mult = 60*60    },
3161                 { .tag  = 'd', .mult = 60*60*24 },
3162                 { .tag  = 0 },
3163         };
3164         unsigned long val;
3165
3166         /*
3167          * If we're using --switch-output-event, then we imply
3168          * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3169          * thread to its parent.
3170          */
3171         if (rec->switch_output_event_set) {
3172                 if (record__threads_enabled(rec)) {
3173                         pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3174                         return 0;
3175                 }
3176                 goto do_signal;
3177         }
3178
3179         if (!s->set)
3180                 return 0;
3181
3182         if (record__threads_enabled(rec)) {
3183                 pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3184                 return 0;
3185         }
3186
3187         if (!strcmp(s->str, "signal")) {
3188 do_signal:
3189                 s->signal = true;
3190                 pr_debug("switch-output with SIGUSR2 signal\n");
3191                 goto enabled;
3192         }
3193
3194         val = parse_tag_value(s->str, tags_size);
3195         if (val != (unsigned long) -1) {
3196                 s->size = val;
3197                 pr_debug("switch-output with %s size threshold\n", s->str);
3198                 goto enabled;
3199         }
3200
3201         val = parse_tag_value(s->str, tags_time);
3202         if (val != (unsigned long) -1) {
3203                 s->time = val;
3204                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3205                          s->str, s->time);
3206                 goto enabled;
3207         }
3208
3209         return -1;
3210
3211 enabled:
3212         rec->timestamp_filename = true;
3213         s->enabled              = true;
3214
3215         if (s->size && !rec->opts.no_buffering)
3216                 switch_output_size_warn(rec);
3217
3218         return 0;
3219 }
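/*
 * Illustrative --switch-output values handled above:
 *
 *   perf record --switch-output ...        (or =signal: rotate on SIGUSR2)
 *   perf record --switch-output=1G ...     (rotate after ~1G of data)
 *   perf record --switch-output=30s ...    (rotate every 30 seconds)
 *
 * All of them imply --timestamp-filename, and none are available in
 * parallel streaming (--threads) mode.
 */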
3220
3221 static const char * const __record_usage[] = {
3222         "perf record [<options>] [<command>]",
3223         "perf record [<options>] -- <command> [<options>]",
3224         NULL
3225 };
3226 const char * const *record_usage = __record_usage;
3227
3228 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3229                                   struct perf_sample *sample, struct machine *machine)
3230 {
3231         /*
3232          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3233          * so no need to add them twice.
3234          */
3235         if (!(event->header.misc & PERF_RECORD_MISC_USER))
3236                 return 0;
3237         return perf_event__process_mmap(tool, event, sample, machine);
3238 }
3239
3240 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3241                                    struct perf_sample *sample, struct machine *machine)
3242 {
3243         /*
3244          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3245          * so no need to add them twice.
3246          */
3247         if (!(event->header.misc & PERF_RECORD_MISC_USER))
3248                 return 0;
3249
3250         return perf_event__process_mmap2(tool, event, sample, machine);
3251 }
3252
3253 static int process_timestamp_boundary(struct perf_tool *tool,
3254                                       union perf_event *event __maybe_unused,
3255                                       struct perf_sample *sample,
3256                                       struct machine *machine __maybe_unused)
3257 {
3258         struct record *rec = container_of(tool, struct record, tool);
3259
3260         set_timestamp_boundary(rec, sample->time);
3261         return 0;
3262 }
3263
3264 static int parse_record_synth_option(const struct option *opt,
3265                                      const char *str,
3266                                      int unset __maybe_unused)
3267 {
3268         struct record_opts *opts = opt->value;
3269         char *p = strdup(str);
3270
3271         if (p == NULL)
3272                 return -1;
3273
3274         opts->synth = parse_synth_opt(p);
3275         free(p);
3276
3277         if (opts->synth < 0) {
3278                 pr_err("Invalid synth option: %s\n", str);
3279                 return -1;
3280         }
3281         return 0;
3282 }
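/*
 * Example --synth values understood by parse_synth_opt() via the callback
 * above (see the option's "no|all|task|mmap|cgroup" spec below):
 *
 *   perf record --synth=no ...
 *   perf record --synth=cgroup ...
 */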
3283
3284 /*
3285  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
3286  * because we need access to it in record__exit(), which is called after
3287  * cmd_record() exits, but since record_options needs to be accessible to
3288  * builtin-script, leave it here.
3289  *
3290  * At least we don't touch it in all the other functions here directly.
3291  *
3292  * Just say no to tons of global variables, sigh.
3293  */
3294 static struct record record = {
3295         .opts = {
3296                 .sample_time         = true,
3297                 .mmap_pages          = UINT_MAX,
3298                 .user_freq           = UINT_MAX,
3299                 .user_interval       = ULLONG_MAX,
3300                 .freq                = 4000,
3301                 .target              = {
3302                         .uses_mmap   = true,
3303                         .default_per_cpu = true,
3304                 },
3305                 .mmap_flush          = MMAP_FLUSH_DEFAULT,
3306                 .nr_threads_synthesize = 1,
3307                 .ctl_fd              = -1,
3308                 .ctl_fd_ack          = -1,
3309                 .synth               = PERF_SYNTH_ALL,
3310         },
3311         .tool = {
3312                 .sample         = process_sample_event,
3313                 .fork           = perf_event__process_fork,
3314                 .exit           = perf_event__process_exit,
3315                 .comm           = perf_event__process_comm,
3316                 .namespaces     = perf_event__process_namespaces,
3317                 .mmap           = build_id__process_mmap,
3318                 .mmap2          = build_id__process_mmap2,
3319                 .itrace_start   = process_timestamp_boundary,
3320                 .aux            = process_timestamp_boundary,
3321                 .ordered_events = true,
3322         },
3323 };
3324
3325 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3326         "\n\t\t\t\tDefault: fp";
3327
3328 static bool dry_run;
3329
3330 /*
3331  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3332  * with it and switch to use the library functions in perf_evlist that came
3333  * from builtin-record.c, i.e. use record_opts,
3334  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3335  * using pipes, etc.
3336  */
3337 static struct option __record_options[] = {
3338         OPT_CALLBACK('e', "event", &record.evlist, "event",
3339                      "event selector. use 'perf list' to list available events",
3340                      parse_events_option),
3341         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3342                      "event filter", parse_filter),
3343         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3344                            NULL, "don't record events from perf itself",
3345                            exclude_perf),
3346         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3347                     "record events on existing process id"),
3348         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3349                     "record events on existing thread id"),
3350         OPT_INTEGER('r', "realtime", &record.realtime_prio,
3351                     "collect data with this RT SCHED_FIFO priority"),
3352         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3353                     "collect data without buffering"),
3354         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3355                     "collect raw sample records from all opened counters"),
3356         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3357                             "system-wide collection from all CPUs"),
3358         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3359                     "list of cpus to monitor"),
3360         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3361         OPT_STRING('o', "output", &record.data.path, "file",
3362                     "output file name"),
3363         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3364                         &record.opts.no_inherit_set,
3365                         "child tasks do not inherit counters"),
3366         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3367                     "synthesize non-sample events at the end of output"),
3368         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3369         OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3370         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3371                     "Fail if the specified frequency can't be used"),
3372         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3373                      "profile at this frequency",
3374                       record__parse_freq),
3375         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3376                      "number of mmap data pages and AUX area tracing mmap pages",
3377                      record__parse_mmap_pages),
3378         OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3379                      "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3380                      record__mmap_flush_parse),
3381         OPT_BOOLEAN(0, "group", &record.opts.group,
3382                     "put the counters into a counter group"),
3383         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3384                            NULL, "enables call-graph recording" ,
3385                            &record_callchain_opt),
3386         OPT_CALLBACK(0, "call-graph", &record.opts,
3387                      "record_mode[,record_size]", record_callchain_help,
3388                      &record_parse_callchain_opt),
3389         OPT_INCR('v', "verbose", &verbose,
3390                     "be more verbose (show counter open errors, etc)"),
3391         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3392         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3393                     "per thread counts"),
3394         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3395         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3396                     "Record the sample physical addresses"),
3397         OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3398                     "Record the sampled data address data page size"),
3399         OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3400                     "Record the sampled code address (ip) page size"),
3401         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3402         OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3403                     "Record the sample identifier"),
3404         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3405                         &record.opts.sample_time_set,
3406                         "Record the sample timestamps"),
3407         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3408                         "Record the sample period"),
3409         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3410                     "don't sample"),
3411         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3412                         &record.no_buildid_cache_set,
3413                         "do not update the buildid cache"),
3414         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3415                         &record.no_buildid_set,
3416                         "do not collect buildids in perf.data"),
3417         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3418                      "monitor event in cgroup name only",
3419                      parse_cgroups),
3420         OPT_CALLBACK('D', "delay", &record, "ms",
3421                      "ms to wait before starting measurement after program start (-1: start with events disabled), "
3422                      "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3423                      record__parse_event_enable_time),
3424         OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3425         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3426                    "user to profile"),
3427
3428         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3429                      "branch any", "sample any taken branches",
3430                      parse_branch_stack),
3431
3432         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3433                      "branch filter mask", "branch stack filter modes",
3434                      parse_branch_stack),
3435         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3436                     "sample by weight (on special events only)"),
3437         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3438                     "sample transaction flags (special events only)"),
3439         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3440                     "use per-thread mmaps"),
3441         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3442                     "sample selected machine registers on interrupt,"
3443                     " use '-I?' to list register names", parse_intr_regs),
3444         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3445                     "sample selected machine registers on interrupt,"
3446                     " use '--user-regs=?' to list register names", parse_user_regs),
3447         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3448                     "Record running/enabled time of read (:S) events"),
3449         OPT_CALLBACK('k', "clockid", &record.opts,
3450         "clockid", "clockid to use for events, see clock_gettime()",
3451         parse_clockid),
3452         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3453                           "opts", "AUX area tracing Snapshot Mode", ""),
3454         OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3455                           "opts", "sample AUX area", ""),
3456         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3457                         "per thread proc mmap processing timeout in ms"),
3458         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3459                     "Record namespaces events"),
3460         OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3461                     "Record cgroup events"),
3462         OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3463                         &record.opts.record_switch_events_set,
3464                         "Record context switch events"),
3465         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3466                          "Configure all used events to run in kernel space.",
3467                          PARSE_OPT_EXCLUSIVE),
3468         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3469                          "Configure all used events to run in user space.",
3470                          PARSE_OPT_EXCLUSIVE),
3471         OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3472                     "collect kernel callchains"),
3473         OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3474                     "collect user callchains"),
3475         OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
3476                    "clang binary to use for compiling BPF scriptlets"),
3477         OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
3478                    "options passed to clang when compiling BPF scriptlets"),
3479         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3480                    "file", "vmlinux pathname"),
3481         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3482                     "Record build-id of all DSOs regardless of hits"),
3483         OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3484                     "Record build-id in map events"),
3485         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3486                     "append timestamp to output filename"),
3487         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3488                     "Record timestamp boundary (time of first/last samples)"),
3489         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3490                           &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3491                           "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3492                           "signal"),
3493         OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
3494                          "switch output event selector. use 'perf list' to list available events",
3495                          parse_events_option_new_evlist),
3496         OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3497                    "Limit number of switch output generated files"),
3498         OPT_BOOLEAN(0, "dry-run", &dry_run,
3499                     "Parse options then exit"),
3500 #ifdef HAVE_AIO_SUPPORT
3501         OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3502                      &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3503                      record__aio_parse),
3504 #endif
3505         OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3506                      "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3507                      record__parse_affinity),
3508 #ifdef HAVE_ZSTD_SUPPORT
3509         OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3510                             "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3511                             record__parse_comp_level),
3512 #endif
3513         OPT_CALLBACK(0, "max-size", &record.output_max_size,
3514                      "size", "Limit the maximum size of the output file", parse_output_max_size),
3515         OPT_UINTEGER(0, "num-thread-synthesize",
3516                      &record.opts.nr_threads_synthesize,
3517                      "number of threads to run for event synthesis"),
3518 #ifdef HAVE_LIBPFM
3519         OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3520                 "libpfm4 event selector. use 'perf list' to list available events",
3521                 parse_libpfm_events_option),
3522 #endif
3523         OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3524                      "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3525                      "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3526                      "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3527                      "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3528                       parse_control_option),
3529         OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3530                      "Fine-tune event synthesis: default=all", parse_record_synth_option),
3531         OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3532                           &record.debuginfod.set, "debuginfod urls",
3533                           "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3534                           "system"),
3535         OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3536                             "write collected trace data into several data files using parallel threads",
3537                             record__parse_threads),
3538         OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3539         OPT_END()
3540 };
3541
3542 struct option *record_options = __record_options;
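/*
 * A representative command line combining options from the table above
 * (workload and values are placeholders):
 *
 *   perf record -e cycles -F 999 -g --switch-output=1G -o perf.data -- ./workload
 */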
3543
3544 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3545 {
3546         struct perf_cpu cpu;
3547         int idx;
3548
3549         if (cpu_map__is_dummy(cpus))
3550                 return 0;
3551
3552         perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3553                 if (cpu.cpu == -1)
3554                         continue;
3555                 /* Return ENODEV if input cpu is greater than max cpu */
3556                 if ((unsigned long)cpu.cpu > mask->nbits)
3557                         return -ENODEV;
3558                 set_bit(cpu.cpu, mask->bits);
3559         }
3560
3561         return 0;
3562 }
3563
3564 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3565 {
3566         struct perf_cpu_map *cpus;
3567
3568         cpus = perf_cpu_map__new(mask_spec);
3569         if (!cpus)
3570                 return -ENOMEM;
3571
3572         bitmap_zero(mask->bits, mask->nbits);
3573         if (record__mmap_cpu_mask_init(mask, cpus))
3574                 return -ENODEV;
3575
3576         perf_cpu_map__put(cpus);
3577
3578         return 0;
3579 }
3580
3581 static void record__free_thread_masks(struct record *rec, int nr_threads)
3582 {
3583         int t;
3584
3585         if (rec->thread_masks)
3586                 for (t = 0; t < nr_threads; t++)
3587                         record__thread_mask_free(&rec->thread_masks[t]);
3588
3589         zfree(&rec->thread_masks);
3590 }
3591
3592 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3593 {
3594         int t, ret;
3595
3596         rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3597         if (!rec->thread_masks) {
3598                 pr_err("Failed to allocate thread masks\n");
3599                 return -ENOMEM;
3600         }
3601
3602         for (t = 0; t < nr_threads; t++) {
3603                 ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3604                 if (ret) {
3605                         pr_err("Failed to allocate thread masks[%d]\n", t);
3606                         goto out_free;
3607                 }
3608         }
3609
3610         return 0;
3611
3612 out_free:
3613         record__free_thread_masks(rec, nr_threads);
3614
3615         return ret;
3616 }
3617
3618 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3619 {
3620         int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3621
3622         ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3623         if (ret)
3624                 return ret;
3625
3626         rec->nr_threads = nr_cpus;
3627         pr_debug("nr_threads: %d\n", rec->nr_threads);
3628
3629         for (t = 0; t < rec->nr_threads; t++) {
3630                 set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3631                 set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3632                 if (verbose) {
3633                         pr_debug("thread_masks[%d]: ", t);
3634                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3635                         pr_debug("thread_masks[%d]: ", t);
3636                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3637                 }
3638         }
3639
3640         return 0;
3641 }
3642
3643 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3644                                           const char **maps_spec, const char **affinity_spec,
3645                                           u32 nr_spec)
3646 {
3647         u32 s;
3648         int ret = 0, t = 0;
3649         struct mmap_cpu_mask cpus_mask;
3650         struct thread_mask thread_mask, full_mask, *thread_masks;
3651
3652         ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3653         if (ret) {
3654                 pr_err("Failed to allocate CPUs mask\n");
3655                 return ret;
3656         }
3657
3658         ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3659         if (ret) {
3660                 pr_err("Failed to init cpu mask\n");
3661                 goto out_free_cpu_mask;
3662         }
3663
3664         ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3665         if (ret) {
3666                 pr_err("Failed to allocate full mask\n");
3667                 goto out_free_cpu_mask;
3668         }
3669
3670         ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3671         if (ret) {
3672                 pr_err("Failed to allocate thread mask\n");
3673                 goto out_free_full_and_cpu_masks;
3674         }
3675
3676         for (s = 0; s < nr_spec; s++) {
3677                 ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3678                 if (ret) {
3679                         pr_err("Failed to initialize maps thread mask\n");
3680                         goto out_free;
3681                 }
3682                 ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3683                 if (ret) {
3684                         pr_err("Failed to initialize affinity thread mask\n");
3685                         goto out_free;
3686                 }
3687
3688                 /* ignore invalid CPUs but do not allow empty masks */
3689                 if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3690                                 cpus_mask.bits, thread_mask.maps.nbits)) {
3691                         pr_err("Empty maps mask: %s\n", maps_spec[s]);
3692                         ret = -EINVAL;
3693                         goto out_free;
3694                 }
3695                 if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3696                                 cpus_mask.bits, thread_mask.affinity.nbits)) {
3697                         pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3698                         ret = -EINVAL;
3699                         goto out_free;
3700                 }
3701
3702                 /* do not allow intersection with other masks (full_mask) */
3703                 if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3704                                       thread_mask.maps.nbits)) {
3705                         pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3706                         ret = -EINVAL;
3707                         goto out_free;
3708                 }
3709                 if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3710                                       thread_mask.affinity.nbits)) {
3711                         pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3712                         ret = -EINVAL;
3713                         goto out_free;
3714                 }
3715
3716                 bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3717                           thread_mask.maps.bits, full_mask.maps.nbits);
3718                 bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3719                           thread_mask.affinity.bits, full_mask.maps.nbits);
3720
3721                 thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3722                 if (!thread_masks) {
3723                         pr_err("Failed to reallocate thread masks\n");
3724                         ret = -ENOMEM;
3725                         goto out_free;
3726                 }
3727                 rec->thread_masks = thread_masks;
3728                 rec->thread_masks[t] = thread_mask;
3729                 if (verbose) {
3730                         pr_debug("thread_masks[%d]: ", t);
3731                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3732                         pr_debug("thread_masks[%d]: ", t);
3733                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3734                 }
3735                 t++;
3736                 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3737                 if (ret) {
3738                         pr_err("Failed to allocate thread mask\n");
3739                         goto out_free_full_and_cpu_masks;
3740                 }
3741         }
3742         rec->nr_threads = t;
3743         pr_debug("nr_threads: %d\n", rec->nr_threads);
3744         if (!rec->nr_threads)
3745                 ret = -EINVAL;
3746
3747 out_free:
3748         record__thread_mask_free(&thread_mask);
3749 out_free_full_and_cpu_masks:
3750         record__thread_mask_free(&full_mask);
3751 out_free_cpu_mask:
3752         record__mmap_cpu_mask_free(&cpus_mask);
3753
3754         return ret;
3755 }
3756
3757 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3758 {
3759         int ret;
3760         struct cpu_topology *topo;
3761
3762         topo = cpu_topology__new();
3763         if (!topo) {
3764                 pr_err("Failed to allocate CPU topology\n");
3765                 return -ENOMEM;
3766         }
3767
3768         ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3769                                              topo->core_cpus_list, topo->core_cpus_lists);
3770         cpu_topology__delete(topo);
3771
3772         return ret;
3773 }
3774
3775 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3776 {
3777         int ret;
3778         struct cpu_topology *topo;
3779
3780         topo = cpu_topology__new();
3781         if (!topo) {
3782                 pr_err("Failed to allocate CPU topology\n");
3783                 return -ENOMEM;
3784         }
3785
3786         ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3787                                              topo->package_cpus_list, topo->package_cpus_lists);
3788         cpu_topology__delete(topo);
3789
3790         return ret;
3791 }
3792
3793 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3794 {
3795         u32 s;
3796         int ret;
3797         const char **spec;
3798         struct numa_topology *topo;
3799
3800         topo = numa_topology__new();
3801         if (!topo) {
3802                 pr_err("Failed to allocate NUMA topology\n");
3803                 return -ENOMEM;
3804         }
3805
3806         spec = zalloc(topo->nr * sizeof(char *));
3807         if (!spec) {
3808                 pr_err("Failed to allocate NUMA spec\n");
3809                 ret = -ENOMEM;
3810                 goto out_delete_topo;
3811         }
3812         for (s = 0; s < topo->nr; s++)
3813                 spec[s] = topo->nodes[s].cpus;
3814
3815         ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3816
3817         zfree(&spec);
3818
3819 out_delete_topo:
3820         numa_topology__delete(topo);
3821
3822         return ret;
3823 }
3824
3825 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3826 {
3827         int t, ret;
3828         u32 s, nr_spec = 0;
3829         char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3830         char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3831
3832         for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3833                 spec = strtok_r(user_spec, ":", &spec_ptr);
3834                 if (spec == NULL)
3835                         break;
3836                 pr_debug2("threads_spec[%d]: %s\n", t, spec);
3837                 mask = strtok_r(spec, "/", &mask_ptr);
3838                 if (mask == NULL)
3839                         break;
3840                 pr_debug2("  maps mask: %s\n", mask);
3841                 tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3842                 if (!tmp_spec) {
3843                         pr_err("Failed to reallocate maps spec\n");
3844                         ret = -ENOMEM;
3845                         goto out_free;
3846                 }
3847                 maps_spec = tmp_spec;
3848                 maps_spec[nr_spec] = dup_mask = strdup(mask);
3849                 if (!maps_spec[nr_spec]) {
3850                         pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3851                         ret = -ENOMEM;
3852                         goto out_free;
3853                 }
3854                 mask = strtok_r(NULL, "/", &mask_ptr);
3855                 if (mask == NULL) {
3856                         pr_err("Invalid thread maps or affinity specs\n");
3857                         ret = -EINVAL;
3858                         goto out_free;
3859                 }
3860                 pr_debug2("  affinity mask: %s\n", mask);
3861                 tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3862                 if (!tmp_spec) {
3863                         pr_err("Failed to reallocate affinity spec\n");
3864                         ret = -ENOMEM;
3865                         goto out_free;
3866                 }
3867                 affinity_spec = tmp_spec;
3868                 affinity_spec[nr_spec] = strdup(mask);
3869                 if (!affinity_spec[nr_spec]) {
3870                         pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3871                         ret = -ENOMEM;
3872                         goto out_free;
3873                 }
3874                 dup_mask = NULL;
3875                 nr_spec++;
3876         }
3877
3878         ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3879                                              (const char **)affinity_spec, nr_spec);
3880
3881 out_free:
3882         free(dup_mask);
3883         for (s = 0; s < nr_spec; s++) {
3884                 if (maps_spec)
3885                         free(maps_spec[s]);
3886                 if (affinity_spec)
3887                         free(affinity_spec[s]);
3888         }
3889         free(affinity_spec);
3890         free(maps_spec);
3891
3892         return ret;
3893 }
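/*
 * Worked example of the spec parsing above: "--threads=0-3/0:4-7/4" is split
 * at ':' into two specs, and each spec at '/' into a maps mask and an
 * affinity mask, yielding maps_spec = { "0-3", "4-7" } and
 * affinity_spec = { "0", "4" }, i.e. one writer thread mmapping CPUs 0-3
 * pinned to CPU 0 and another mmapping CPUs 4-7 pinned to CPU 4.
 */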
3894
3895 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3896 {
3897         int ret;
3898
3899         ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3900         if (ret)
3901                 return ret;
3902
3903         if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3904                 return -ENODEV;
3905
3906         rec->nr_threads = 1;
3907
3908         return 0;
3909 }
3910
3911 static int record__init_thread_masks(struct record *rec)
3912 {
3913         int ret = 0;
3914         struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3915
3916         if (!record__threads_enabled(rec))
3917                 return record__init_thread_default_masks(rec, cpus);
3918
3919         if (evlist__per_thread(rec->evlist)) {
3920                 pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3921                 return -EINVAL;
3922         }
3923
3924         switch (rec->opts.threads_spec) {
3925         case THREAD_SPEC__CPU:
3926                 ret = record__init_thread_cpu_masks(rec, cpus);
3927                 break;
3928         case THREAD_SPEC__CORE:
3929                 ret = record__init_thread_core_masks(rec, cpus);
3930                 break;
3931         case THREAD_SPEC__PACKAGE:
3932                 ret = record__init_thread_package_masks(rec, cpus);
3933                 break;
3934         case THREAD_SPEC__NUMA:
3935                 ret = record__init_thread_numa_masks(rec, cpus);
3936                 break;
3937         case THREAD_SPEC__USER:
3938                 ret = record__init_thread_user_masks(rec, cpus);
3939                 break;
3940         default:
3941                 break;
3942         }
3943
3944         return ret;
3945 }
3946
3947 int cmd_record(int argc, const char **argv)
3948 {
3949         int err;
3950         struct record *rec = &record;
3951         char errbuf[BUFSIZ];
3952
3953         setlocale(LC_ALL, "");
3954
3955 #ifndef HAVE_LIBBPF_SUPPORT
3956 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
3957         set_nobuild('\0', "clang-path", true);
3958         set_nobuild('\0', "clang-opt", true);
3959 # undef set_nobuild
3960 #endif
3961
3962 #ifndef HAVE_BPF_PROLOGUE
3963 # if !defined (HAVE_DWARF_SUPPORT)
3964 #  define REASON  "NO_DWARF=1"
3965 # elif !defined (HAVE_LIBBPF_SUPPORT)
3966 #  define REASON  "NO_LIBBPF=1"
3967 # else
3968 #  define REASON  "this architecture doesn't support BPF prologue"
3969 # endif
3970 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
3971         set_nobuild('\0', "vmlinux", true);
3972 # undef set_nobuild
3973 # undef REASON
3974 #endif
3975
3976 #ifndef HAVE_BPF_SKEL
3977 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3978         set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3979 # undef set_nobuild
3980 #endif
3981
3982         rec->opts.affinity = PERF_AFFINITY_SYS;
3983
3984         rec->evlist = evlist__new();
3985         if (rec->evlist == NULL)
3986                 return -ENOMEM;
3987
3988         err = perf_config(perf_record_config, rec);
3989         if (err)
3990                 return err;
3991
3992         argc = parse_options(argc, argv, record_options, record_usage,
3993                             PARSE_OPT_STOP_AT_NON_OPTION);
3994         if (quiet)
3995                 perf_quiet_option();
3996
3997         err = symbol__validate_sym_arguments();
3998         if (err)
3999                 return err;
4000
4001         perf_debuginfod_setup(&record.debuginfod);
4002
4003         /* Make system wide (-a) the default target. */
4004         if (!argc && target__none(&rec->opts.target))
4005                 rec->opts.target.system_wide = true;
4006
4007         if (nr_cgroups && !rec->opts.target.system_wide) {
4008                 usage_with_options_msg(record_usage, record_options,
4009                         "cgroup monitoring only available in system-wide mode");
4010
4011         }
4012
4013         if (rec->buildid_mmap) {
4014                 if (!perf_can_record_build_id()) {
4015                         pr_err("Failed: no support for recording build id in mmap events, update your kernel.\n");
4016                         err = -EINVAL;
4017                         goto out_opts;
4018                 }
4019                 pr_debug("Enabling build id in mmap2 events.\n");
4020                 /* Enable mmap build id synthesizing. */
4021                 symbol_conf.buildid_mmap2 = true;
4022                 /* Enable perf_event_attr::build_id bit. */
4023                 rec->opts.build_id = true;
4024                 /* Disable build id cache. */
4025                 rec->no_buildid = true;
4026         }
4027
4028         if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4029                 pr_err("Kernel has no cgroup sampling support.\n");
4030                 err = -EINVAL;
4031                 goto out_opts;
4032         }
4033
4034         if (rec->opts.kcore)
4035                 rec->opts.text_poke = true;
4036
4037         if (rec->opts.kcore || record__threads_enabled(rec))
4038                 rec->data.is_dir = true;
4039
4040         if (record__threads_enabled(rec)) {
4041                 if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4042                         pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4043                         goto out_opts;
4044                 }
4045                 if (record__aio_enabled(rec)) {
4046                         pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4047                         goto out_opts;
4048                 }
4049         }
4050
4051         if (rec->opts.comp_level != 0) {
4052                 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4053                 rec->no_buildid = true;
4054         }
4055
4056         if (rec->opts.record_switch_events &&
4057             !perf_can_record_switch_events()) {
4058                 ui__error("kernel does not support recording context switch events\n");
4059                 parse_options_usage(record_usage, record_options, "switch-events", 0);
4060                 err = -EINVAL;
4061                 goto out_opts;
4062         }
4063
4064         if (switch_output_setup(rec)) {
4065                 parse_options_usage(record_usage, record_options, "switch-output", 0);
4066                 err = -EINVAL;
4067                 goto out_opts;
4068         }
4069
4070         if (rec->switch_output.time) {
4071                 signal(SIGALRM, alarm_sig_handler);
4072                 alarm(rec->switch_output.time);
4073         }
4074
4075         if (rec->switch_output.num_files) {
4076                 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4077                                                       sizeof(char *));
4078                 if (!rec->switch_output.filenames) {
4079                         err = -EINVAL;
4080                         goto out_opts;
4081                 }
4082         }
4083
4084         if (rec->timestamp_filename && record__threads_enabled(rec)) {
4085                 rec->timestamp_filename = false;
4086                 pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4087         }
4088
4089         /*
4090          * Allow aliases to facilitate the lookup of symbols for address
4091          * filters. Refer to auxtrace_parse_filters().
4092          */
4093         symbol_conf.allow_aliases = true;
4094
4095         symbol__init(NULL);
4096
4097         err = record__auxtrace_init(rec);
4098         if (err)
4099                 goto out;
4100
4101         if (dry_run)
4102                 goto out;
4103
4104         err = bpf__setup_stdout(rec->evlist);
4105         if (err) {
4106                 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
4107                 pr_err("ERROR: Setup BPF stdout failed: %s\n",
4108                          errbuf);
4109                 goto out;
4110         }
4111
4112         err = -ENOMEM;
4113
4114         if (rec->no_buildid_cache || rec->no_buildid) {
4115                 disable_buildid_cache();
4116         } else if (rec->switch_output.enabled) {
4117                 /*
4118                  * In 'perf record --switch-output', disable buildid
4119                  * generation by default to reduce data file switching
4120                  * overhead. Still generate buildid if they are required
4121                  * overhead. Still generate buildids if they are required
4122                  * explicitly using
4123                  *
4124                  *  perf record --switch-output --no-no-buildid \
4125                  *              --no-no-buildid-cache
4126                  *
4127                  * The following code is equivalent to:
4128                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
4129                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4130                  *         disable_buildid_cache();
4131                  */
4132                 bool disable = true;
4133
4134                 if (rec->no_buildid_set && !rec->no_buildid)
4135                         disable = false;
4136                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4137                         disable = false;
4138                 if (disable) {
4139                         rec->no_buildid = true;
4140                         rec->no_buildid_cache = true;
4141                         disable_buildid_cache();
4142                 }
4143         }
4144
4145         if (record.opts.overwrite)
4146                 record.opts.tail_synthesize = true;
4147
4148         if (rec->evlist->core.nr_entries == 0) {
4149                 if (perf_pmu__has_hybrid()) {
4150                         err = evlist__add_default_hybrid(rec->evlist,
4151                                                          !record.opts.no_samples);
4152                 } else {
4153                         err = __evlist__add_default(rec->evlist,
4154                                                     !record.opts.no_samples);
4155                 }
4156
4157                 if (err < 0) {
4158                         pr_err("Not enough memory for event selector list\n");
4159                         goto out;
4160                 }
4161         }
4162
4163         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4164                 rec->opts.no_inherit = true;
4165
4166         err = target__validate(&rec->opts.target);
4167         if (err) {
4168                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4169                 ui__warning("%s\n", errbuf);
4170         }
4171
4172         err = target__parse_uid(&rec->opts.target);
4173         if (err) {
4174                 int saved_errno = errno;
4175
4176                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4177                 ui__error("%s", errbuf);
4178
4179                 err = -saved_errno;
4180                 goto out;
4181         }
4182
4183         /* Enable ignoring missing threads when -u/-p option is defined. */
4184         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4185
4186         if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
4187                 pr_err("failed to use cpu list %s\n", rec->opts.target.cpu_list);
4188                 err = -EINVAL;
4189                 goto out;
4190         }
4191
4192         rec->opts.target.hybrid = perf_pmu__has_hybrid();
4193
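             /*
              * For frame pointer call chains, let the architecture request the
              * extra sample data needed to report leaf frame callers (on arm64
              * the link register is added to the sampled user registers).
              */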
4194         if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4195                 arch__add_leaf_frame_record_opts(&rec->opts);
4196
4197         err = -ENOMEM;
4198         if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4199                 if (rec->opts.target.pid != NULL) {
4200                         pr_err("Couldn't create thread/CPU maps: %s\n",
4201                                 errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4202                         goto out;
4203                 } else {
4204                         usage_with_options(record_usage, record_options);
4205                 }
4206         }
4207
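             /*
              * Let the AUX area tracing implementation (e.g. Intel PT) validate
              * the configuration and adjust the record options.
              */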
4208         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4209         if (err)
4210                 goto out;
4211
4212         /*
4213          * Take all buildids when the file contains AUX area
4214          * tracing data, because we do not decode the trace
4215          * (decoding it would take too long).
4216          */
4217         if (rec->opts.full_auxtrace)
4218                 rec->buildid_all = true;
4219
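             /* --text-poke: also capture kernel text modification events. */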
4220         if (rec->opts.text_poke) {
4221                 err = record__config_text_poke(rec->evlist);
4222                 if (err) {
4223                         pr_err("record__config_text_poke failed, error %d\n", err);
4224                         goto out;
4225                 }
4226         }
4227
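             /* --off-cpu: set up the BPF-based off-CPU profiling event. */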
4228         if (rec->off_cpu) {
4229                 err = record__config_off_cpu(rec);
4230                 if (err) {
4231                         pr_err("record__config_off_cpu failed, error %d\n", err);
4232                         goto out;
4233                 }
4234         }
4235
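             /* Finalize generic record options such as the sampling frequency. */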
4236         if (record_opts__config(&rec->opts)) {
4237                 err = -EINVAL;
4238                 goto out;
4239         }
4240
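             /* Build per-thread CPU masks for --threads data streaming. */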
4241         err = record__init_thread_masks(rec);
4242         if (err) {
4243                 pr_err("Failed to initialize parallel data streaming masks\n");
4244                 goto out;
4245         }
4246
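             /* Clamp the number of AIO blocks (--aio) to the supported maximum. */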
4247         if (rec->opts.nr_cblocks > nr_cblocks_max)
4248                 rec->opts.nr_cblocks = nr_cblocks_max;
4249         pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4250
4251         pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4252         pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4253
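             /* Clamp the compression level (-z) to the supported maximum. */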
4254         if (rec->opts.comp_level > comp_level_max)
4255                 rec->opts.comp_level = comp_level_max;
4256         pr_debug("comp level: %d\n", rec->opts.comp_level);
4257
4258         err = __cmd_record(&record, argc, argv);
4259 out:
4260         evlist__delete(rec->evlist);
4261         symbol__exit();
4262         auxtrace_record__free(rec->itr);
4263 out_opts:
4264         record__free_thread_masks(rec, rec->nr_threads);
4265         rec->nr_threads = 0;
4266         evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4267         return err;
4268 }
4269
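     /*
      * SIGUSR2 handler: triggers an AUX area tracing snapshot (if enabled)
      * and, with --switch-output=signal, a switch to a new output file.
      */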
4270 static void snapshot_sig_handler(int sig __maybe_unused)
4271 {
4272         struct record *rec = &record;
4273
4274         hit_auxtrace_snapshot_trigger(rec);
4275
4276         if (switch_output_signal(rec))
4277                 trigger_hit(&switch_output_trigger);
4278 }
4279
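     /*
      * SIGALRM handler: with a time-based --switch-output (e.g.
      * 'perf record --switch-output=30s') the output file is rotated
      * whenever the timer fires.
      */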
4280 static void alarm_sig_handler(int sig __maybe_unused)
4281 {
4282         struct record *rec = &record;
4283
4284         if (switch_output_time(rec))
4285                 trigger_hit(&switch_output_trigger);
4286 }