// SPDX-License-Identifier: GPL-2.0
/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#include "builtin.h"

#include "util/build-id.h"
#include <subcmd/parse-options.h>
#include <internal/xyarray.h>
#include "util/parse-events.h"
#include "util/config.h"

#include "util/callchain.h"
#include "util/cgroup.h"
#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/mmap.h"
#include "util/mutex.h"
#include "util/target.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
#include "util/record.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/data.h"
#include "util/perf_regs.h"
#include "util/auxtrace.h"
#include "util/tsc.h"
#include "util/parse-branch-options.h"
#include "util/parse-regs-options.h"
#include "util/perf_api_probe.h"
#include "util/trigger.h"
#include "util/perf-hooks.h"
#include "util/cpu-set-sched.h"
#include "util/synthetic-events.h"
#include "util/time-utils.h"
#include "util/units.h"
#include "util/bpf-event.h"
#include "util/util.h"
#include "util/pfm.h"
#include "util/pmu.h"
#include "util/pmus.h"
#include "util/clockid.h"
#include "util/off_cpu.h"
#include "util/bpf-filter.h"
#include "asm/bug.h"
#include "perf.h"
#include "cputopo.h"

#include <errno.h>
#include <inttypes.h>
#include <locale.h>
#include <poll.h>
#include <pthread.h>
#include <unistd.h>
#ifndef HAVE_GETTID
#include <syscall.h>
#endif
#include <sched.h>
#include <signal.h>
#ifdef HAVE_EVENTFD_SUPPORT
#include <sys/eventfd.h>
#endif
#include <sys/mman.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <linux/err.h>
#include <linux/string.h>
#include <linux/time64.h>
#include <linux/zalloc.h>
#include <linux/bitmap.h>
#include <sys/time.h>

struct switch_output {
        bool             enabled;
        bool             signal;
        unsigned long    size;
        unsigned long    time;
        const char      *str;
        bool             set;
        char             **filenames;
        int              num_files;
        int              cur_file;
};

struct thread_mask {
        struct mmap_cpu_mask    maps;
        struct mmap_cpu_mask    affinity;
};

struct record_thread {
        pid_t                   tid;
        struct thread_mask      *mask;
        struct {
                int             msg[2];
                int             ack[2];
        } pipes;
        struct fdarray          pollfd;
        int                     ctlfd_pos;
        int                     nr_mmaps;
        struct mmap             **maps;
        struct mmap             **overwrite_maps;
        struct record           *rec;
        unsigned long long      samples;
        unsigned long           waking;
        u64                     bytes_written;
        u64                     bytes_transferred;
        u64                     bytes_compressed;
};

static __thread struct record_thread *thread;

enum thread_msg {
        THREAD_MSG__UNDEFINED = 0,
        THREAD_MSG__READY,
        THREAD_MSG__MAX,
};

static const char *thread_msg_tags[THREAD_MSG__MAX] = {
        "UNDEFINED", "READY"
};

enum thread_spec {
        THREAD_SPEC__UNDEFINED = 0,
        THREAD_SPEC__CPU,
        THREAD_SPEC__CORE,
        THREAD_SPEC__PACKAGE,
        THREAD_SPEC__NUMA,
        THREAD_SPEC__USER,
        THREAD_SPEC__MAX,
};

static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
        "undefined", "cpu", "core", "package", "numa", "user"
};

struct pollfd_index_map {
        int evlist_pollfd_index;
        int thread_pollfd_index;
};

struct record {
        struct perf_tool        tool;
        struct record_opts      opts;
        u64                     bytes_written;
        u64                     thread_bytes_written;
        struct perf_data        data;
        struct auxtrace_record  *itr;
        struct evlist   *evlist;
        struct perf_session     *session;
        struct evlist           *sb_evlist;
        pthread_t               thread_id;
        int                     realtime_prio;
        bool                    switch_output_event_set;
        bool                    no_buildid;
        bool                    no_buildid_set;
        bool                    no_buildid_cache;
        bool                    no_buildid_cache_set;
        bool                    buildid_all;
        bool                    buildid_mmap;
        bool                    timestamp_filename;
        bool                    timestamp_boundary;
        bool                    off_cpu;
        struct switch_output    switch_output;
        unsigned long long      samples;
        unsigned long           output_max_size;        /* = 0: unlimited */
        struct perf_debuginfod  debuginfod;
        int                     nr_threads;
        struct thread_mask      *thread_masks;
        struct record_thread    *thread_data;
        struct pollfd_index_map *index_map;
        size_t                  index_map_sz;
        size_t                  index_map_cnt;
};

static volatile int done;

static volatile int auxtrace_record__snapshot_started;
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);

static const char *affinity_tags[PERF_AFFINITY_MAX] = {
        "SYS", "NODE", "CPU"
};

#ifndef HAVE_GETTID
static inline pid_t gettid(void)
{
        return (pid_t)syscall(__NR_gettid);
}
#endif

static int record__threads_enabled(struct record *rec)
{
        return rec->opts.threads_spec;
}

static bool switch_output_signal(struct record *rec)
{
        return rec->switch_output.signal &&
               trigger_is_ready(&switch_output_trigger);
}

static bool switch_output_size(struct record *rec)
{
        return rec->switch_output.size &&
               trigger_is_ready(&switch_output_trigger) &&
               (rec->bytes_written >= rec->switch_output.size);
}

static bool switch_output_time(struct record *rec)
{
        return rec->switch_output.time &&
               trigger_is_ready(&switch_output_trigger);
}

static u64 record__bytes_written(struct record *rec)
{
        return rec->bytes_written + rec->thread_bytes_written;
}

static bool record__output_max_size_exceeded(struct record *rec)
{
        return rec->output_max_size &&
               (record__bytes_written(rec) >= rec->output_max_size);
}

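/*
 * Write a block of data to the output. In parallel streaming mode each mmap
 * has its own file, so bytes are accounted per thread as well as globally;
 * otherwise everything goes to the main perf.data file. Stops the session
 * once the configured output size limit is exceeded and arms the
 * switch-output trigger when the size threshold is reached.
 */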
static int record__write(struct record *rec, struct mmap *map __maybe_unused,
                         void *bf, size_t size)
{
        struct perf_data_file *file = &rec->session->data->file;

        if (map && map->file)
                file = map->file;

        if (perf_data_file__write(file, bf, size) < 0) {
                pr_err("failed to write perf data, error: %m\n");
                return -1;
        }

        if (map && map->file) {
                thread->bytes_written += size;
                rec->thread_bytes_written += size;
        } else {
                rec->bytes_written += size;
        }

        if (record__output_max_size_exceeded(rec) && !done) {
                fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
                                " stopping session ]\n",
                                record__bytes_written(rec) >> 10);
                done = 1;
        }

        if (switch_output_size(rec))
                trigger_hit(&switch_output_trigger);

        return 0;
}

static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session, struct mmap *map,
                            void *dst, size_t dst_size, void *src, size_t src_size);

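/*
 * Asynchronous trace writing (--aio): data is copied out of the kernel ring
 * buffer into per-mmap aio buffers and queued with aio_write(), so the ring
 * buffer space can be released without waiting for the file I/O to complete.
 */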
#ifdef HAVE_AIO_SUPPORT
static int record__aio_write(struct aiocb *cblock, int trace_fd,
                void *buf, size_t size, off_t off)
{
        int rc;

        cblock->aio_fildes = trace_fd;
        cblock->aio_buf    = buf;
        cblock->aio_nbytes = size;
        cblock->aio_offset = off;
        cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

        do {
                rc = aio_write(cblock);
                if (rc == 0) {
                        break;
                } else if (errno != EAGAIN) {
                        cblock->aio_fildes = -1;
                        pr_err("failed to queue perf data, error: %m\n");
                        break;
                }
        } while (1);

        return rc;
}

static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
{
        void *rem_buf;
        off_t rem_off;
        size_t rem_size;
        int rc, aio_errno;
        ssize_t aio_ret, written;

        aio_errno = aio_error(cblock);
        if (aio_errno == EINPROGRESS)
                return 0;

        written = aio_ret = aio_return(cblock);
        if (aio_ret < 0) {
                if (aio_errno != EINTR)
                        pr_err("failed to write perf data, error: %m\n");
                written = 0;
        }

        rem_size = cblock->aio_nbytes - written;

        if (rem_size == 0) {
                cblock->aio_fildes = -1;
                /*
                 * md->refcount is incremented in record__aio_pushfn() for
                 * every aio write request started in record__aio_push(), so
                 * decrement it because the request is now complete.
                 */
                perf_mmap__put(&md->core);
                rc = 1;
        } else {
                /*
                 * The aio write request may need to be restarted with the
                 * remainder if the kernel didn't write the whole chunk
                 * at once.
                 */
                rem_off = cblock->aio_offset + written;
                rem_buf = (void *)(cblock->aio_buf + written);
                record__aio_write(cblock, cblock->aio_fildes,
                                rem_buf, rem_size, rem_off);
                rc = 0;
        }

        return rc;
}

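/*
 * Wait for in-flight aio writes on this mmap. With sync_all == false the
 * index of the first free control block is returned so it can be reused;
 * with sync_all == true the function only returns once every outstanding
 * request has completed.
 */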
static int record__aio_sync(struct mmap *md, bool sync_all)
{
        struct aiocb **aiocb = md->aio.aiocb;
        struct aiocb *cblocks = md->aio.cblocks;
        struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
        int i, do_suspend;

        do {
                do_suspend = 0;
                for (i = 0; i < md->aio.nr_cblocks; ++i) {
                        if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
                                if (sync_all)
                                        aiocb[i] = NULL;
                                else
                                        return i;
                        } else {
                                /*
                                 * The started aio write is not complete yet,
                                 * so it has to be waited on before the
                                 * next allocation.
                                 */
                                aiocb[i] = &cblocks[i];
                                do_suspend = 1;
                        }
                }
                if (!do_suspend)
                        return -1;

                while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
                        if (!(errno == EAGAIN || errno == EINTR))
                                pr_err("failed to sync perf data, error: %m\n");
                }
        } while (1);
}

struct record_aio {
        struct record   *rec;
        void            *data;
        size_t          size;
};

static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
{
        struct record_aio *aio = to;

        /*
         * The map->core.base data pointed to by buf is copied into a free
         * map->aio.data[] buffer to release space in the kernel buffer as fast
         * as possible, via perf_mmap__consume() called from perf_mmap__push().
         *
         * That lets the kernel proceed with storing more profiling data into
         * the kernel buffer earlier than other per-cpu kernel buffers are handled.
         *
         * Copying can be done in two steps in case the chunk of profiling data
         * crosses the upper bound of the kernel buffer. In this case we first move
         * part of the data from map->start up to the upper bound and then the
         * remainder from the beginning of the kernel buffer to the end of the
         * data chunk.
         */

        if (record__comp_enabled(aio->rec)) {
                size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
                                     mmap__mmap_len(map) - aio->size,
                                     buf, size);
        } else {
                memcpy(aio->data + aio->size, buf, size);
        }

        if (!aio->size) {
                /*
                 * Increment map->refcount to guard the map->aio.data[] buffer
                 * from premature deallocation, because the map object can be
                 * released before the aio write request started on the
                 * map->aio.data[] buffer is complete.
                 *
                 * perf_mmap__put() is done at record__aio_complete()
                 * after started aio request completion or at record__aio_push()
                 * if the request failed to start.
                 */
                perf_mmap__get(&map->core);
        }

        aio->size += size;

        return size;
}

static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
{
        int ret, idx;
        int trace_fd = rec->session->data->file.fd;
        struct record_aio aio = { .rec = rec, .size = 0 };

        /*
         * Call record__aio_sync() to wait till map->aio.data[] buffer
         * becomes available after previous aio write operation.
         */

        idx = record__aio_sync(map, false);
        aio.data = map->aio.data[idx];
        ret = perf_mmap__push(map, &aio, record__aio_pushfn);
        if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
                return ret;

        rec->samples++;
        ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
        if (!ret) {
                *off += aio.size;
                rec->bytes_written += aio.size;
                if (switch_output_size(rec))
                        trigger_hit(&switch_output_trigger);
        } else {
                /*
                 * Decrement map->refcount incremented in record__aio_pushfn()
                 * back if record__aio_write() operation failed to start, otherwise
                 * map->refcount is decremented in record__aio_complete() after
                 * aio write operation finishes successfully.
                 */
                perf_mmap__put(&map->core);
        }

        return ret;
}

static off_t record__aio_get_pos(int trace_fd)
{
        return lseek(trace_fd, 0, SEEK_CUR);
}

static void record__aio_set_pos(int trace_fd, off_t pos)
{
        lseek(trace_fd, pos, SEEK_SET);
}

static void record__aio_mmap_read_sync(struct record *rec)
{
        int i;
        struct evlist *evlist = rec->evlist;
        struct mmap *maps = evlist->mmap;

        if (!record__aio_enabled(rec))
                return;

        for (i = 0; i < evlist->core.nr_mmaps; i++) {
                struct mmap *map = &maps[i];

                if (map->core.base)
                        record__aio_sync(map, true);
        }
}

static int nr_cblocks_default = 1;
static int nr_cblocks_max = 4;

static int record__aio_parse(const struct option *opt,
                             const char *str,
                             int unset)
{
        struct record_opts *opts = (struct record_opts *)opt->value;

        if (unset) {
                opts->nr_cblocks = 0;
        } else {
                if (str)
                        opts->nr_cblocks = strtol(str, NULL, 0);
                if (!opts->nr_cblocks)
                        opts->nr_cblocks = nr_cblocks_default;
        }

        return 0;
}
#else /* HAVE_AIO_SUPPORT */
static int nr_cblocks_max = 0;

static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
                            off_t *off __maybe_unused)
{
        return -1;
}

static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
        return -1;
}

static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}

static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
#endif

static int record__aio_enabled(struct record *rec)
{
        return rec->opts.nr_cblocks > 0;
}

#define MMAP_FLUSH_DEFAULT 1
static int record__mmap_flush_parse(const struct option *opt,
                                    const char *str,
                                    int unset)
{
        int flush_max;
        struct record_opts *opts = (struct record_opts *)opt->value;
        static struct parse_tag tags[] = {
                        { .tag  = 'B', .mult = 1       },
                        { .tag  = 'K', .mult = 1 << 10 },
                        { .tag  = 'M', .mult = 1 << 20 },
                        { .tag  = 'G', .mult = 1 << 30 },
                        { .tag  = 0 },
        };

        if (unset)
                return 0;

        if (str) {
                opts->mmap_flush = parse_tag_value(str, tags);
                if (opts->mmap_flush == (int)-1)
                        opts->mmap_flush = strtol(str, NULL, 0);
        }

        if (!opts->mmap_flush)
                opts->mmap_flush = MMAP_FLUSH_DEFAULT;

        flush_max = evlist__mmap_size(opts->mmap_pages);
        flush_max /= 4;
        if (opts->mmap_flush > flush_max)
                opts->mmap_flush = flush_max;

        return 0;
}

#ifdef HAVE_ZSTD_SUPPORT
static unsigned int comp_level_default = 1;

static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
{
        struct record_opts *opts = opt->value;

        if (unset) {
                opts->comp_level = 0;
        } else {
                if (str)
                        opts->comp_level = strtol(str, NULL, 0);
                if (!opts->comp_level)
                        opts->comp_level = comp_level_default;
        }

        return 0;
}
#endif
static unsigned int comp_level_max = 22;

static int record__comp_enabled(struct record *rec)
{
        return rec->opts.comp_level > 0;
}

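/*
 * Callback used when synthesizing events (e.g. the module and kernel mmaps
 * synthesized below): the event is written straight to the output. The
 * _locked variant serializes writers when synthesis runs in multiple threads.
 */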
static int process_synthesized_event(struct perf_tool *tool,
                                     union perf_event *event,
                                     struct perf_sample *sample __maybe_unused,
                                     struct machine *machine __maybe_unused)
{
        struct record *rec = container_of(tool, struct record, tool);
        return record__write(rec, NULL, event, event->header.size);
}

static struct mutex synth_lock;

static int process_locked_synthesized_event(struct perf_tool *tool,
                                     union perf_event *event,
                                     struct perf_sample *sample __maybe_unused,
                                     struct machine *machine __maybe_unused)
{
        int ret;

        mutex_lock(&synth_lock);
        ret = process_synthesized_event(tool, event, sample, machine);
        mutex_unlock(&synth_lock);
        return ret;
}

static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
{
        struct record *rec = to;

        if (record__comp_enabled(rec)) {
                size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
                bf   = map->data;
        }

        thread->samples++;
        return record__write(rec, map, bf, size);
}

static volatile sig_atomic_t signr = -1;
static volatile sig_atomic_t child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
static volatile sig_atomic_t done_fd = -1;
#endif

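/*
 * Signal handling: SIGCHLD marks the workload as finished, any other signal
 * just records which one arrived. Either way 'done' is set so the main loop
 * can wind down and flush the remaining data.
 */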
static void sig_handler(int sig)
{
        if (sig == SIGCHLD)
                child_finished = 1;
        else
                signr = sig;

        done = 1;
#ifdef HAVE_EVENTFD_SUPPORT
        if (done_fd >= 0) {
                u64 tmp = 1;
                int orig_errno = errno;

                /*
                 * It is possible for this signal handler to run after done is
                 * checked in the main loop, but before the perf counter fds are
                 * polled. If this happens, the poll() will continue to wait
                 * even though done is set, and will only break out if either
                 * another signal is received, or the counters are ready for
                 * read. To ensure the poll() doesn't sleep when done is set,
                 * use an eventfd (done_fd) to wake up the poll().
                 */
                if (write(done_fd, &tmp, sizeof(tmp)) < 0)
                        pr_err("failed to signal wakeup fd, error: %m\n");

                errno = orig_errno;
        }
#endif // HAVE_EVENTFD_SUPPORT
}

static void sigsegv_handler(int sig)
{
        perf_hooks__recover();
        sighandler_dump_stack(sig);
}

static void record__sig_exit(void)
{
        if (signr == -1)
                return;

        signal(signr, SIG_DFL);
        raise(signr);
}

#ifdef HAVE_AUXTRACE_SUPPORT

static int record__process_auxtrace(struct perf_tool *tool,
                                    struct mmap *map,
                                    union perf_event *event, void *data1,
                                    size_t len1, void *data2, size_t len2)
{
        struct record *rec = container_of(tool, struct record, tool);
        struct perf_data *data = &rec->data;
        size_t padding;
        u8 pad[8] = {0};

        if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
                off_t file_offset;
                int fd = perf_data__fd(data);
                int err;

                file_offset = lseek(fd, 0, SEEK_CUR);
                if (file_offset == -1)
                        return -1;
                err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
                                                     event, file_offset);
                if (err)
                        return err;
        }

        /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
        padding = (len1 + len2) & 7;
        if (padding)
                padding = 8 - padding;

        record__write(rec, map, event, event->header.size);
        record__write(rec, map, data1, len1);
        if (len2)
                record__write(rec, map, data2, len2);
        record__write(rec, map, &pad, padding);

        return 0;
}

static int record__auxtrace_mmap_read(struct record *rec,
                                      struct mmap *map)
{
        int ret;

        ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
                                  record__process_auxtrace);
        if (ret < 0)
                return ret;

        if (ret)
                rec->samples++;

        return 0;
}

static int record__auxtrace_mmap_read_snapshot(struct record *rec,
                                               struct mmap *map)
{
        int ret;

        ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
                                           record__process_auxtrace,
                                           rec->opts.auxtrace_snapshot_size);
        if (ret < 0)
                return ret;

        if (ret)
                rec->samples++;

        return 0;
}

static int record__auxtrace_read_snapshot_all(struct record *rec)
{
        int i;
        int rc = 0;

        for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
                struct mmap *map = &rec->evlist->mmap[i];

                if (!map->auxtrace_mmap.base)
                        continue;

                if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
                        rc = -1;
                        goto out;
                }
        }
out:
        return rc;
}

static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
{
        pr_debug("Recording AUX area tracing snapshot\n");
        if (record__auxtrace_read_snapshot_all(rec) < 0) {
                trigger_error(&auxtrace_snapshot_trigger);
        } else {
                if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
                        trigger_error(&auxtrace_snapshot_trigger);
                else
                        trigger_ready(&auxtrace_snapshot_trigger);
        }
}

static int record__auxtrace_snapshot_exit(struct record *rec)
{
        if (trigger_is_error(&auxtrace_snapshot_trigger))
                return 0;

        if (!auxtrace_record__snapshot_started &&
            auxtrace_record__snapshot_start(rec->itr))
                return -1;

        record__read_auxtrace_snapshot(rec, true);
        if (trigger_is_error(&auxtrace_snapshot_trigger))
                return -1;

        return 0;
}

static int record__auxtrace_init(struct record *rec)
{
        int err;

        if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
            && record__threads_enabled(rec)) {
                pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
                return -EINVAL;
        }

        if (!rec->itr) {
                rec->itr = auxtrace_record__init(rec->evlist, &err);
                if (err)
                        return err;
        }

        err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
                                              rec->opts.auxtrace_snapshot_opts);
        if (err)
                return err;

        err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
                                            rec->opts.auxtrace_sample_opts);
        if (err)
                return err;

        auxtrace_regroup_aux_output(rec->evlist);

        return auxtrace_parse_filters(rec->evlist);
}

#else

static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
                               struct mmap *map __maybe_unused)
{
        return 0;
}

static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
                                    bool on_exit __maybe_unused)
{
}

static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
        return 0;
}

static inline
int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
{
        return 0;
}

static int record__auxtrace_init(struct record *rec __maybe_unused)
{
        return 0;
}

#endif

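/*
 * For text poke recording, make sure there is an event with attr.text_poke
 * set; if none was requested, add a system-wide dummy event that also
 * captures ksymbol events and timestamps.
 */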
static int record__config_text_poke(struct evlist *evlist)
{
        struct evsel *evsel;

        /* Nothing to do if text poke is already configured */
        evlist__for_each_entry(evlist, evsel) {
                if (evsel->core.attr.text_poke)
                        return 0;
        }

        evsel = evlist__add_dummy_on_all_cpus(evlist);
        if (!evsel)
                return -ENOMEM;

        evsel->core.attr.text_poke = 1;
        evsel->core.attr.ksymbol = 1;
        evsel->immediate = true;
        evsel__set_sample_bit(evsel, TIME);

        return 0;
}

static int record__config_off_cpu(struct record *rec)
{
        return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
}

static bool record__kcore_readable(struct machine *machine)
{
        char kcore[PATH_MAX];
        int fd;

        scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);

        fd = open(kcore, O_RDONLY);
        if (fd < 0)
                return false;

        close(fd);

        return true;
}

static int record__kcore_copy(struct machine *machine, struct perf_data *data)
{
        char from_dir[PATH_MAX];
        char kcore_dir[PATH_MAX];
        int ret;

        snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);

        ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
        if (ret)
                return ret;

        return kcore_copy(from_dir, kcore_dir);
}

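/*
 * Each worker thread and the main thread communicate over a pair of pipes
 * ('msg' and 'ack') used to exchange THREAD_MSG__* control messages and
 * readiness notifications. Descriptors are initialized to -1 so that
 * cleanup can tell which ends are actually open.
 */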
static void record__thread_data_init_pipes(struct record_thread *thread_data)
{
        thread_data->pipes.msg[0] = -1;
        thread_data->pipes.msg[1] = -1;
        thread_data->pipes.ack[0] = -1;
        thread_data->pipes.ack[1] = -1;
}

static int record__thread_data_open_pipes(struct record_thread *thread_data)
{
        if (pipe(thread_data->pipes.msg))
                return -EINVAL;

        if (pipe(thread_data->pipes.ack)) {
                close(thread_data->pipes.msg[0]);
                thread_data->pipes.msg[0] = -1;
                close(thread_data->pipes.msg[1]);
                thread_data->pipes.msg[1] = -1;
                return -EINVAL;
        }

        pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
                 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
                 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);

        return 0;
}

static void record__thread_data_close_pipes(struct record_thread *thread_data)
{
        if (thread_data->pipes.msg[0] != -1) {
                close(thread_data->pipes.msg[0]);
                thread_data->pipes.msg[0] = -1;
        }
        if (thread_data->pipes.msg[1] != -1) {
                close(thread_data->pipes.msg[1]);
                thread_data->pipes.msg[1] = -1;
        }
        if (thread_data->pipes.ack[0] != -1) {
                close(thread_data->pipes.ack[0]);
                thread_data->pipes.ack[0] = -1;
        }
        if (thread_data->pipes.ack[1] != -1) {
                close(thread_data->pipes.ack[1]);
                thread_data->pipes.ack[1] = -1;
        }
}

static bool evlist__per_thread(struct evlist *evlist)
{
        return cpu_map__is_dummy(evlist->core.user_requested_cpus);
}

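/*
 * Assign evlist mmaps to this worker thread: in per-thread (dummy CPU map)
 * mode all mmaps are taken, otherwise only the mmaps whose CPU is set in
 * the thread's maps mask.
 */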
static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
{
        int m, tm, nr_mmaps = evlist->core.nr_mmaps;
        struct mmap *mmap = evlist->mmap;
        struct mmap *overwrite_mmap = evlist->overwrite_mmap;
        struct perf_cpu_map *cpus = evlist->core.all_cpus;
        bool per_thread = evlist__per_thread(evlist);

        if (per_thread)
                thread_data->nr_mmaps = nr_mmaps;
        else
                thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
                                                      thread_data->mask->maps.nbits);
        if (mmap) {
                thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
                if (!thread_data->maps)
                        return -ENOMEM;
        }
        if (overwrite_mmap) {
                thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
                if (!thread_data->overwrite_maps) {
                        zfree(&thread_data->maps);
                        return -ENOMEM;
                }
        }
        pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
                 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);

        for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
                if (per_thread ||
                    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
                        if (thread_data->maps) {
                                thread_data->maps[tm] = &mmap[m];
                                pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
                                          thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
                        }
                        if (thread_data->overwrite_maps) {
                                thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
                                pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
                                          thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
                        }
                        tm++;
                }
        }

        return 0;
}

static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
{
        int f, tm, pos;
        struct mmap *map, *overwrite_map;

        fdarray__init(&thread_data->pollfd, 64);

        for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
                map = thread_data->maps ? thread_data->maps[tm] : NULL;
                overwrite_map = thread_data->overwrite_maps ?
                                thread_data->overwrite_maps[tm] : NULL;

                for (f = 0; f < evlist->core.pollfd.nr; f++) {
                        void *ptr = evlist->core.pollfd.priv[f].ptr;

                        if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
                                pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
                                                              &evlist->core.pollfd);
                                if (pos < 0)
                                        return pos;
                                pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
                                         thread_data, pos, evlist->core.pollfd.entries[f].fd);
                        }
                }
        }

        return 0;
}

static void record__free_thread_data(struct record *rec)
{
        int t;
        struct record_thread *thread_data = rec->thread_data;

        if (thread_data == NULL)
                return;

        for (t = 0; t < rec->nr_threads; t++) {
                record__thread_data_close_pipes(&thread_data[t]);
                zfree(&thread_data[t].maps);
                zfree(&thread_data[t].overwrite_maps);
                fdarray__exit(&thread_data[t].pollfd);
        }

        zfree(&rec->thread_data);
}

static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
                                                    int evlist_pollfd_index,
                                                    int thread_pollfd_index)
{
        size_t x = rec->index_map_cnt;

        if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
                return -ENOMEM;
        rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
        rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
        rec->index_map_cnt += 1;
        return 0;
}

static int record__update_evlist_pollfd_from_thread(struct record *rec,
                                                    struct evlist *evlist,
                                                    struct record_thread *thread_data)
{
        struct pollfd *e_entries = evlist->core.pollfd.entries;
        struct pollfd *t_entries = thread_data->pollfd.entries;
        int err = 0;
        size_t i;

        for (i = 0; i < rec->index_map_cnt; i++) {
                int e_pos = rec->index_map[i].evlist_pollfd_index;
                int t_pos = rec->index_map[i].thread_pollfd_index;

                if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
                    e_entries[e_pos].events != t_entries[t_pos].events) {
                        pr_err("Thread and evlist pollfd index mismatch\n");
                        err = -EINVAL;
                        continue;
                }
                e_entries[e_pos].revents = t_entries[t_pos].revents;
        }
        return err;
}

static int record__dup_non_perf_events(struct record *rec,
                                       struct evlist *evlist,
                                       struct record_thread *thread_data)
{
        struct fdarray *fda = &evlist->core.pollfd;
        int i, ret;

        for (i = 0; i < fda->nr; i++) {
                if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
                        continue;
                ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
                if (ret < 0) {
                        pr_err("Failed to duplicate descriptor in main thread pollfd\n");
                        return ret;
                }
                pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
                          thread_data, ret, fda->entries[i].fd);
                ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
                if (ret < 0) {
                        pr_err("Failed to map thread and evlist pollfd indexes\n");
                        return ret;
                }
        }
        return 0;
}

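/*
 * Allocate and initialize per-thread data for all recording threads.
 * Thread 0 is the main thread: it keeps the non-perf-event descriptors in
 * its pollfd; the other threads get communication pipes and have the read
 * end of their message pipe added to their pollfd.
 */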
static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
{
        int t, ret;
        struct record_thread *thread_data;

        rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
        if (!rec->thread_data) {
                pr_err("Failed to allocate thread data\n");
                return -ENOMEM;
        }
        thread_data = rec->thread_data;

        for (t = 0; t < rec->nr_threads; t++)
                record__thread_data_init_pipes(&thread_data[t]);

        for (t = 0; t < rec->nr_threads; t++) {
                thread_data[t].rec = rec;
                thread_data[t].mask = &rec->thread_masks[t];
                ret = record__thread_data_init_maps(&thread_data[t], evlist);
                if (ret) {
                        pr_err("Failed to initialize thread[%d] maps\n", t);
                        goto out_free;
                }
                ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
                if (ret) {
                        pr_err("Failed to initialize thread[%d] pollfd\n", t);
                        goto out_free;
                }
                if (t) {
                        thread_data[t].tid = -1;
                        ret = record__thread_data_open_pipes(&thread_data[t]);
                        if (ret) {
                                pr_err("Failed to open thread[%d] communication pipes\n", t);
                                goto out_free;
                        }
                        ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
                                           POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
                        if (ret < 0) {
                                pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
                                goto out_free;
                        }
                        thread_data[t].ctlfd_pos = ret;
                        pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
                                 thread_data, thread_data[t].ctlfd_pos,
                                 thread_data[t].pipes.msg[0]);
                } else {
                        thread_data[t].tid = gettid();

                        ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
                        if (ret < 0)
                                goto out_free;

                        thread_data[t].ctlfd_pos = -1; /* Not used */
                }
        }

        return 0;

out_free:
        record__free_thread_data(rec);

        return ret;
}

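/*
 * Map the ring buffers for all events, set up the control fd and the
 * per-thread data, and, in parallel streaming mode, attach one output file
 * per mmap under the data directory.
 */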
static int record__mmap_evlist(struct record *rec,
                               struct evlist *evlist)
{
        int i, ret;
        struct record_opts *opts = &rec->opts;
        bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
                                  opts->auxtrace_sample_mode;
        char msg[512];

        if (opts->affinity != PERF_AFFINITY_SYS)
                cpu__setup_cpunode_map();

        if (evlist__mmap_ex(evlist, opts->mmap_pages,
                                 opts->auxtrace_mmap_pages,
                                 auxtrace_overwrite,
                                 opts->nr_cblocks, opts->affinity,
                                 opts->mmap_flush, opts->comp_level) < 0) {
                if (errno == EPERM) {
                        pr_err("Permission error mapping pages.\n"
                               "Consider increasing "
                               "/proc/sys/kernel/perf_event_mlock_kb,\n"
                               "or try again with a smaller value of -m/--mmap_pages.\n"
                               "(current value: %u,%u)\n",
                               opts->mmap_pages, opts->auxtrace_mmap_pages);
                        return -errno;
                } else {
                        pr_err("failed to mmap with %d (%s)\n", errno,
                                str_error_r(errno, msg, sizeof(msg)));
                        if (errno)
                                return -errno;
                        else
                                return -EINVAL;
                }
        }

        if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
                return -1;

        ret = record__alloc_thread_data(rec, evlist);
        if (ret)
                return ret;

        if (record__threads_enabled(rec)) {
                ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
                if (ret) {
                        pr_err("Failed to create data directory: %s\n", strerror(-ret));
                        return ret;
                }
                for (i = 0; i < evlist->core.nr_mmaps; i++) {
                        if (evlist->mmap)
                                evlist->mmap[i].file = &rec->data.dir.files[i];
                        if (evlist->overwrite_mmap)
                                evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
                }
        }

        return 0;
}

static int record__mmap(struct record *rec)
{
        return record__mmap_evlist(rec, rec->evlist);
}

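/*
 * Open all events in the evlist, falling back or dropping weak group
 * members when the kernel rejects an attribute, apply event filters and
 * finally mmap the ring buffers.
 */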
static int record__open(struct record *rec)
{
        char msg[BUFSIZ];
        struct evsel *pos;
        struct evlist *evlist = rec->evlist;
        struct perf_session *session = rec->session;
        struct record_opts *opts = &rec->opts;
        int rc = 0;

        /*
         * For initial_delay, system wide or a hybrid system, we need to add a
         * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
         * of waiting or event synthesis.
         */
        if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
            perf_pmus__num_core_pmus() > 1) {
                pos = evlist__get_tracking_event(evlist);
                if (!evsel__is_dummy_event(pos)) {
                        /* Set up dummy event. */
                        if (evlist__add_dummy(evlist))
                                return -ENOMEM;
                        pos = evlist__last(evlist);
                        evlist__set_tracking_event(evlist, pos);
                }

                /*
                 * Enable the dummy event when the process is forked for
                 * initial_delay, immediately for system wide.
                 */
                if (opts->target.initial_delay && !pos->immediate &&
                    !target__has_cpu(&opts->target))
                        pos->core.attr.enable_on_exec = 1;
                else
                        pos->immediate = 1;
        }

        evlist__config(evlist, opts, &callchain_param);

        evlist__for_each_entry(evlist, pos) {
try_again:
                if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
                        if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
                                if (verbose > 0)
                                        ui__warning("%s\n", msg);
                                goto try_again;
                        }
                        if ((errno == EINVAL || errno == EBADF) &&
                            pos->core.leader != &pos->core &&
                            pos->weak_group) {
                                pos = evlist__reset_weak_group(evlist, pos, true);
                                goto try_again;
                        }
                        rc = -errno;
                        evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
                        ui__error("%s\n", msg);
                        goto out;
                }

                pos->supported = true;
        }

        if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
                pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
        }

        if (evlist__apply_filters(evlist, &pos)) {
                pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
                        pos->filter ?: "BPF", evsel__name(pos), errno,
                        str_error_r(errno, msg, sizeof(msg)));
                rc = -1;
                goto out;
        }

        rc = record__mmap(rec);
        if (rc)
                goto out;

        session->evlist = evlist;
        perf_session__set_id_hdr_size(session);
out:
        return rc;
}

static void set_timestamp_boundary(struct record *rec, u64 sample_time)
{
        if (rec->evlist->first_sample_time == 0)
                rec->evlist->first_sample_time = sample_time;

        if (sample_time)
                rec->evlist->last_sample_time = sample_time;
}

static int process_sample_event(struct perf_tool *tool,
                                union perf_event *event,
                                struct perf_sample *sample,
                                struct evsel *evsel,
                                struct machine *machine)
{
        struct record *rec = container_of(tool, struct record, tool);

        set_timestamp_boundary(rec, sample->time);

        if (rec->buildid_all)
                return 0;

        rec->samples++;
        return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
}

static int process_buildids(struct record *rec)
{
        struct perf_session *session = rec->session;

        if (perf_data__size(&rec->data) == 0)
                return 0;

        /*
         * During this process, it'll load the kernel map and replace the
         * dso->long_name with the real pathname it found.  In this case
         * we prefer the vmlinux path like
         *   /lib/modules/3.16.4/build/vmlinux
         *
         * rather than the build-id path (in the debug directory).
         *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
         */
        symbol_conf.ignore_vmlinux_buildid = true;

        /*
         * If --buildid-all is given, it marks all DSOs regardless of hits,
         * so there is no need to process samples. But if timestamp_boundary
         * is enabled, it still needs to walk all samples to get the
         * timestamps of the first/last samples.
         */
        if (rec->buildid_all && !rec->timestamp_boundary)
                rec->tool.sample = NULL;

        return perf_session__process_events(session);
}

static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
        int err;
        struct perf_tool *tool = data;
        /*
         * For the guest kernel, when processing the record & report
         * subcommands, we arrange the module mmaps prior to the guest kernel
         * mmap and trigger a dso preload, because by default guest module
         * symbols are loaded from guest kallsyms instead of
         * /lib/modules/XXX/XXX. This avoids missing symbols when the first
         * address is in a module instead of in the guest kernel.
         */
        err = perf_event__synthesize_modules(tool, process_synthesized_event,
                                             machine);
        if (err < 0)
                pr_err("Couldn't record guest kernel [%d]'s reference"
                       " relocation symbol.\n", machine->pid);

        /*
         * We use _stext for the guest kernel because the guest kernel's
         * /proc/kallsyms sometimes has no _text.
         */
        err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
                                                 machine);
        if (err < 0)
                pr_err("Couldn't record guest kernel [%d]'s reference"
                       " relocation symbol.\n", machine->pid);
}

static struct perf_event_header finished_round_event = {
        .size = sizeof(struct perf_event_header),
        .type = PERF_RECORD_FINISHED_ROUND,
};

static struct perf_event_header finished_init_event = {
        .size = sizeof(struct perf_event_header),
        .type = PERF_RECORD_FINISHED_INIT,
};

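/*
 * When --affinity is not 'sys', migrate the reading thread onto the CPUs
 * backing the mmap that is about to be read, so the buffer is accessed from
 * a nearby CPU.
 */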
static void record__adjust_affinity(struct record *rec, struct mmap *map)
{
        if (rec->opts.affinity != PERF_AFFINITY_SYS &&
            !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
                          thread->mask->affinity.nbits)) {
                bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
                bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
                          map->affinity_mask.bits, thread->mask->affinity.nbits);
                sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
                                        (cpu_set_t *)thread->mask->affinity.bits);
                if (verbose == 2) {
                        pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
                        mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
                }
        }
}

static size_t process_comp_header(void *record, size_t increment)
{
        struct perf_record_compressed *event = record;
        size_t size = sizeof(*event);

        if (increment) {
                event->header.size += increment;
                return increment;
        }

        event->header.type = PERF_RECORD_COMPRESSED;
        event->header.size = size;

        return size;
}

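/*
 * Compress a chunk of trace data into PERF_RECORD_COMPRESSED records,
 * accounting transferred vs. compressed byte counts per thread when the
 * mmap writes to its own file, or per session otherwise.
 */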
1498 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1499                             void *dst, size_t dst_size, void *src, size_t src_size)
1500 {
1501         size_t compressed;
1502         size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1503         struct zstd_data *zstd_data = &session->zstd_data;
1504
1505         if (map && map->file)
1506                 zstd_data = &map->zstd_data;
1507
1508         compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1509                                                      max_record_size, process_comp_header);
1510
1511         if (map && map->file) {
1512                 thread->bytes_transferred += src_size;
1513                 thread->bytes_compressed  += compressed;
1514         } else {
1515                 session->bytes_transferred += src_size;
1516                 session->bytes_compressed  += compressed;
1517         }
1518
1519         return compressed;
1520 }
1521
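/*
 * Drain either the regular or the overwrite mmaps owned by the current
 * thread: push their data to the output (via AIO when enabled), read AUX
 * area data where applicable, and mark the round finished once something
 * was written (not needed in directory mode).
 */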
1522 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1523                                     bool overwrite, bool synch)
1524 {
1525         u64 bytes_written = rec->bytes_written;
1526         int i;
1527         int rc = 0;
1528         int nr_mmaps;
1529         struct mmap **maps;
1530         int trace_fd = rec->data.file.fd;
1531         off_t off = 0;
1532
1533         if (!evlist)
1534                 return 0;
1535
1536         nr_mmaps = thread->nr_mmaps;
1537         maps = overwrite ? thread->overwrite_maps : thread->maps;
1538
1539         if (!maps)
1540                 return 0;
1541
1542         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1543                 return 0;
1544
1545         if (record__aio_enabled(rec))
1546                 off = record__aio_get_pos(trace_fd);
1547
1548         for (i = 0; i < nr_mmaps; i++) {
1549                 u64 flush = 0;
1550                 struct mmap *map = maps[i];
1551
1552                 if (map->core.base) {
1553                         record__adjust_affinity(rec, map);
1554                         if (synch) {
1555                                 flush = map->core.flush;
1556                                 map->core.flush = 1;
1557                         }
1558                         if (!record__aio_enabled(rec)) {
1559                                 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1560                                         if (synch)
1561                                                 map->core.flush = flush;
1562                                         rc = -1;
1563                                         goto out;
1564                                 }
1565                         } else {
1566                                 if (record__aio_push(rec, map, &off) < 0) {
1567                                         record__aio_set_pos(trace_fd, off);
1568                                         if (synch)
1569                                                 map->core.flush = flush;
1570                                         rc = -1;
1571                                         goto out;
1572                                 }
1573                         }
1574                         if (synch)
1575                                 map->core.flush = flush;
1576                 }
1577
1578                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1579                     !rec->opts.auxtrace_sample_mode &&
1580                     record__auxtrace_mmap_read(rec, map) != 0) {
1581                         rc = -1;
1582                         goto out;
1583                 }
1584         }
1585
1586         if (record__aio_enabled(rec))
1587                 record__aio_set_pos(trace_fd, off);
1588
1589         /*
1590          * Mark the round finished in case we wrote
1591          * at least one event.
1592          *
1593          * No need for round events in directory mode,
1594          * because the per-CPU maps and files already have
1595          * their data sorted by the kernel.
1596          */
1597         if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1598                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1599
1600         if (overwrite)
1601                 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1602 out:
1603         return rc;
1604 }
1605
1606 static int record__mmap_read_all(struct record *rec, bool synch)
1607 {
1608         int err;
1609
1610         err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1611         if (err)
1612                 return err;
1613
1614         return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1615 }
1616
1617 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1618                                            void *arg __maybe_unused)
1619 {
1620         struct perf_mmap *map = fda->priv[fd].ptr;
1621
1622         if (map)
1623                 perf_mmap__put(map);
1624 }
1625
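/*
 * Body of an extra recording thread in --threads mode: acknowledge start
 * over the ack pipe, then loop draining this thread's mmaps, polling when
 * idle, until the message pipe is hung up, and finish with one final
 * synchronous flush.
 */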
1626 static void *record__thread(void *arg)
1627 {
1628         enum thread_msg msg = THREAD_MSG__READY;
1629         bool terminate = false;
1630         struct fdarray *pollfd;
1631         int err, ctlfd_pos;
1632
1633         thread = arg;
1634         thread->tid = gettid();
1635
1636         err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1637         if (err == -1)
1638                 pr_warning("threads[%d]: failed to notify on start: %s\n",
1639                            thread->tid, strerror(errno));
1640
1641         pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1642
1643         pollfd = &thread->pollfd;
1644         ctlfd_pos = thread->ctlfd_pos;
1645
1646         for (;;) {
1647                 unsigned long long hits = thread->samples;
1648
1649                 if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1650                         break;
1651
1652                 if (hits == thread->samples) {
1653
1654                         err = fdarray__poll(pollfd, -1);
1655                         /*
1656                          * Propagate an error only if there is one. Ignore a
1657                          * positive number of returned events and EINTR.
1658                          */
1659                         if (err > 0 || (err < 0 && errno == EINTR))
1660                                 err = 0;
1661                         thread->waking++;
1662
1663                         if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1664                                             record__thread_munmap_filtered, NULL) == 0)
1665                                 break;
1666                 }
1667
1668                 if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1669                         terminate = true;
1670                         close(thread->pipes.msg[0]);
1671                         thread->pipes.msg[0] = -1;
1672                         pollfd->entries[ctlfd_pos].fd = -1;
1673                         pollfd->entries[ctlfd_pos].events = 0;
1674                 }
1675
1676                 pollfd->entries[ctlfd_pos].revents = 0;
1677         }
1678         record__mmap_read_all(thread->rec, true);
1679
1680         err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1681         if (err == -1)
1682                 pr_warning("threads[%d]: failed to notify on termination: %s\n",
1683                            thread->tid, strerror(errno));
1684
1685         return NULL;
1686 }
1687
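/*
 * Start from all header features enabled and clear the ones that do not
 * apply to this record session (build IDs, tracing data, clock data,
 * directory format, compression, ...).
 */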
1688 static void record__init_features(struct record *rec)
1689 {
1690         struct perf_session *session = rec->session;
1691         int feat;
1692
1693         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1694                 perf_header__set_feat(&session->header, feat);
1695
1696         if (rec->no_buildid)
1697                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1698
1699 #ifdef HAVE_LIBTRACEEVENT
1700         if (!have_tracepoints(&rec->evlist->core.entries))
1701                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1702 #endif
1703
1704         if (!rec->opts.branch_stack)
1705                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1706
1707         if (!rec->opts.full_auxtrace)
1708                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1709
1710         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1711                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1712
1713         if (!rec->opts.use_clockid)
1714                 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1715
1716         if (!record__threads_enabled(rec))
1717                 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1718
1719         if (!record__comp_enabled(rec))
1720                 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1721
1722         perf_header__clear_feat(&session->header, HEADER_STAT);
1723 }
1724
1725 static void
1726 record__finish_output(struct record *rec)
1727 {
1728         int i;
1729         struct perf_data *data = &rec->data;
1730         int fd = perf_data__fd(data);
1731
1732         if (data->is_pipe)
1733                 return;
1734
1735         rec->session->header.data_size += rec->bytes_written;
1736         data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1737         if (record__threads_enabled(rec)) {
1738                 for (i = 0; i < data->dir.nr; i++)
1739                         data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1740         }
1741
1742         if (!rec->no_buildid) {
1743                 process_buildids(rec);
1744
1745                 if (rec->buildid_all)
1746                         dsos__hit_all(rec->session);
1747         }
1748         perf_session__write_header(rec->session, rec->evlist, fd, true);
1749
1750         return;
1751 }
1752
1753 static int record__synthesize_workload(struct record *rec, bool tail)
1754 {
1755         int err;
1756         struct perf_thread_map *thread_map;
1757         bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1758
1759         if (rec->opts.tail_synthesize != tail)
1760                 return 0;
1761
1762         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1763         if (thread_map == NULL)
1764                 return -1;
1765
1766         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1767                                                  process_synthesized_event,
1768                                                  &rec->session->machines.host,
1769                                                  needs_mmap,
1770                                                  rec->opts.sample_address);
1771         perf_thread_map__put(thread_map);
1772         return err;
1773 }
1774
1775 static int write_finished_init(struct record *rec, bool tail)
1776 {
1777         if (rec->opts.tail_synthesize != tail)
1778                 return 0;
1779
1780         return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1781 }
1782
1783 static int record__synthesize(struct record *rec, bool tail);
1784
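/*
 * Close out the current output file and switch to a new one (triggered by
 * signal, size or time), resynthesizing tracking events so the new file
 * is self-contained.  Returns the new output fd or a negative error code.
 */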
1785 static int
1786 record__switch_output(struct record *rec, bool at_exit)
1787 {
1788         struct perf_data *data = &rec->data;
1789         int fd, err;
1790         char *new_filename;
1791
1792         /* Same size as a timestamp like "2015122520103046" */
1793         char timestamp[] = "InvalidTimestamp";
1794
1795         record__aio_mmap_read_sync(rec);
1796
1797         write_finished_init(rec, true);
1798
1799         record__synthesize(rec, true);
1800         if (target__none(&rec->opts.target))
1801                 record__synthesize_workload(rec, true);
1802
1803         rec->samples = 0;
1804         record__finish_output(rec);
1805         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1806         if (err) {
1807                 pr_err("Failed to get current timestamp\n");
1808                 return -EINVAL;
1809         }
1810
1811         fd = perf_data__switch(data, timestamp,
1812                                     rec->session->header.data_offset,
1813                                     at_exit, &new_filename);
1814         if (fd >= 0 && !at_exit) {
1815                 rec->bytes_written = 0;
1816                 rec->session->header.data_size = 0;
1817         }
1818
1819         if (!quiet)
1820                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1821                         data->path, timestamp);
1822
1823         if (rec->switch_output.num_files) {
1824                 int n = rec->switch_output.cur_file + 1;
1825
1826                 if (n >= rec->switch_output.num_files)
1827                         n = 0;
1828                 rec->switch_output.cur_file = n;
1829                 if (rec->switch_output.filenames[n]) {
1830                         remove(rec->switch_output.filenames[n]);
1831                         zfree(&rec->switch_output.filenames[n]);
1832                 }
1833                 rec->switch_output.filenames[n] = new_filename;
1834         } else {
1835                 free(new_filename);
1836         }
1837
1838         /* Output tracking events */
1839         if (!at_exit) {
1840                 record__synthesize(rec, false);
1841
1842                 /*
1843                  * In 'perf record --switch-output' without -a,
1844                  * record__synthesize() in record__switch_output() won't
1845                  * generate tracking events because there's no thread_map
1846                  * in the evlist, so the newly created perf.data wouldn't
1847                  * contain map and comm information.
1848                  * Create a fake thread_map and directly call
1849                  * perf_event__synthesize_thread_map() for those events.
1850                  */
1851                 if (target__none(&rec->opts.target))
1852                         record__synthesize_workload(rec, false);
1853                 write_finished_init(rec, false);
1854         }
1855         return fd;
1856 }
1857
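/*
 * Fill in and write a single PERF_RECORD_LOST_SAMPLES event for the given
 * counter instance, attaching a synthesized id sample so that the lost
 * count can be attributed to the right event.
 */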
1858 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1859                                         struct perf_record_lost_samples *lost,
1860                                         int cpu_idx, int thread_idx, u64 lost_count,
1861                                         u16 misc_flag)
1862 {
1863         struct perf_sample_id *sid;
1864         struct perf_sample sample = {};
1865         int id_hdr_size;
1866
1867         lost->lost = lost_count;
1868         if (evsel->core.ids) {
1869                 sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1870                 sample.id = sid->id;
1871         }
1872
1873         id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1874                                                        evsel->core.attr.sample_type, &sample);
1875         lost->header.size = sizeof(*lost) + id_hdr_size;
1876         lost->header.misc = misc_flag;
1877         record__write(rec, NULL, lost, lost->header.size);
1878 }
1879
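/*
 * At the end of the session, read the per-fd lost-sample counts (and any
 * BPF filter drops) from the kernel and emit them into the output as
 * PERF_RECORD_LOST_SAMPLES events.
 */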
1880 static void record__read_lost_samples(struct record *rec)
1881 {
1882         struct perf_session *session = rec->session;
1883         struct perf_record_lost_samples *lost;
1884         struct evsel *evsel;
1885
1886         /* there was an error during record__open */
1887         if (session->evlist == NULL)
1888                 return;
1889
1890         lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1891         if (lost == NULL) {
1892                 pr_debug("Memory allocation failed\n");
1893                 return;
1894         }
1895
1896         lost->header.type = PERF_RECORD_LOST_SAMPLES;
1897
1898         evlist__for_each_entry(session->evlist, evsel) {
1899                 struct xyarray *xy = evsel->core.sample_id;
1900                 u64 lost_count;
1901
1902                 if (xy == NULL || evsel->core.fd == NULL)
1903                         continue;
1904                 if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1905                     xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1906                         pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1907                         continue;
1908                 }
1909
1910                 for (int x = 0; x < xyarray__max_x(xy); x++) {
1911                         for (int y = 0; y < xyarray__max_y(xy); y++) {
1912                                 struct perf_counts_values count;
1913
1914                                 if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1915                                         pr_debug("read LOST count failed\n");
1916                                         goto out;
1917                                 }
1918
1919                                 if (count.lost) {
1920                                         __record__save_lost_samples(rec, evsel, lost,
1921                                                                     x, y, count.lost, 0);
1922                                 }
1923                         }
1924                 }
1925
1926                 lost_count = perf_bpf_filter__lost_count(evsel);
1927                 if (lost_count)
1928                         __record__save_lost_samples(rec, evsel, lost, 0, 0, lost_count,
1929                                                     PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1930         }
1931 out:
1932         free(lost);
1933 }
1934
1935 static volatile sig_atomic_t workload_exec_errno;
1936
1937 /*
1938  * evlist__prepare_workload will send a SIGUSR1
1939  * if the fork fails, since we asked for it by setting its
1940  * want_signal to true.
1941  */
1942 static void workload_exec_failed_signal(int signo __maybe_unused,
1943                                         siginfo_t *info,
1944                                         void *ucontext __maybe_unused)
1945 {
1946         workload_exec_errno = info->si_value.sival_int;
1947         done = 1;
1948         child_finished = 1;
1949 }
1950
1951 static void snapshot_sig_handler(int sig);
1952 static void alarm_sig_handler(int sig);
1953
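/*
 * Pick any mmapped ring buffer's user page so that
 * perf_event__synth_time_conv() can read the kernel's time conversion
 * parameters from it.
 */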
1954 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1955 {
1956         if (evlist) {
1957                 if (evlist->mmap && evlist->mmap[0].core.base)
1958                         return evlist->mmap[0].core.base;
1959                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1960                         return evlist->overwrite_mmap[0].core.base;
1961         }
1962         return NULL;
1963 }
1964
1965 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1966 {
1967         const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1968         if (pc)
1969                 return pc;
1970         return NULL;
1971 }
1972
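/*
 * Synthesize the non-sample events that describe the system state at the
 * start (or, with --tail-synthesize, at the end) of the session: kernel
 * and module mmaps, id index, auxtrace info, thread and CPU maps, BPF and
 * cgroup events, and the existing threads of the target.
 */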
1973 static int record__synthesize(struct record *rec, bool tail)
1974 {
1975         struct perf_session *session = rec->session;
1976         struct machine *machine = &session->machines.host;
1977         struct perf_data *data = &rec->data;
1978         struct record_opts *opts = &rec->opts;
1979         struct perf_tool *tool = &rec->tool;
1980         int err = 0;
1981         event_op f = process_synthesized_event;
1982
1983         if (rec->opts.tail_synthesize != tail)
1984                 return 0;
1985
1986         if (data->is_pipe) {
1987                 err = perf_event__synthesize_for_pipe(tool, session, data,
1988                                                       process_synthesized_event);
1989                 if (err < 0)
1990                         goto out;
1991
1992                 rec->bytes_written += err;
1993         }
1994
1995         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1996                                           process_synthesized_event, machine);
1997         if (err)
1998                 goto out;
1999
2000         /* Synthesize id_index before auxtrace_info */
2001         err = perf_event__synthesize_id_index(tool,
2002                                               process_synthesized_event,
2003                                               session->evlist, machine);
2004         if (err)
2005                 goto out;
2006
2007         if (rec->opts.full_auxtrace) {
2008                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2009                                         session, process_synthesized_event);
2010                 if (err)
2011                         goto out;
2012         }
2013
2014         if (!evlist__exclude_kernel(rec->evlist)) {
2015                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2016                                                          machine);
2017                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2018                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2019                                    "Check /proc/kallsyms permission or run as root.\n");
2020
2021                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
2022                                                      machine);
2023                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2024                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2025                                    "Check /proc/modules permission or run as root.\n");
2026         }
2027
2028         if (perf_guest) {
2029                 machines__process_guests(&session->machines,
2030                                          perf_event__synthesize_guest_os, tool);
2031         }
2032
2033         err = perf_event__synthesize_extra_attr(&rec->tool,
2034                                                 rec->evlist,
2035                                                 process_synthesized_event,
2036                                                 data->is_pipe);
2037         if (err)
2038                 goto out;
2039
2040         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2041                                                  process_synthesized_event,
2042                                                 NULL);
2043         if (err < 0) {
2044                 pr_err("Couldn't synthesize thread map.\n");
2045                 return err;
2046         }
2047
2048         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2049                                              process_synthesized_event, NULL);
2050         if (err < 0) {
2051                 pr_err("Couldn't synthesize cpu map.\n");
2052                 return err;
2053         }
2054
2055         err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2056                                                 machine, opts);
2057         if (err < 0) {
2058                 pr_warning("Couldn't synthesize bpf events.\n");
2059                 err = 0;
2060         }
2061
2062         if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2063                 err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2064                                                      machine);
2065                 if (err < 0) {
2066                         pr_warning("Couldn't synthesize cgroup events.\n");
2067                         err = 0;
2068                 }
2069         }
2070
2071         if (rec->opts.nr_threads_synthesize > 1) {
2072                 mutex_init(&synth_lock);
2073                 perf_set_multithreaded();
2074                 f = process_locked_synthesized_event;
2075         }
2076
2077         if (rec->opts.synth & PERF_SYNTH_TASK) {
2078                 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2079
2080                 err = __machine__synthesize_threads(machine, tool, &opts->target,
2081                                                     rec->evlist->core.threads,
2082                                                     f, needs_mmap, opts->sample_address,
2083                                                     rec->opts.nr_threads_synthesize);
2084         }
2085
2086         if (rec->opts.nr_threads_synthesize > 1) {
2087                 perf_set_singlethreaded();
2088                 mutex_destroy(&synth_lock);
2089         }
2090
2091 out:
2092         return err;
2093 }
2094
2095 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2096 {
2097         struct record *rec = data;
2098         pthread_kill(rec->thread_id, SIGUSR2);
2099         return 0;
2100 }
2101
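/*
 * Set up the side-band evlist: attach the --switch-output-event callback
 * and, with libbpf support, the PERF_RECORD_BPF_EVENT tracking event,
 * then start the side-band reader thread.
 */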
2102 static int record__setup_sb_evlist(struct record *rec)
2103 {
2104         struct record_opts *opts = &rec->opts;
2105
2106         if (rec->sb_evlist != NULL) {
2107                 /*
2108                  * We get here if --switch-output-event populated the
2109                  * sb_evlist, so associate a callback that will send a SIGUSR2
2110                  * to the main thread.
2111                  */
2112                 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2113                 rec->thread_id = pthread_self();
2114         }
2115 #ifdef HAVE_LIBBPF_SUPPORT
2116         if (!opts->no_bpf_event) {
2117                 if (rec->sb_evlist == NULL) {
2118                         rec->sb_evlist = evlist__new();
2119
2120                         if (rec->sb_evlist == NULL) {
2121                                 pr_err("Couldn't create side band evlist.\n");
2122                                 return -1;
2123                         }
2124                 }
2125
2126                 if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2127                         pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2128                         return -1;
2129                 }
2130         }
2131 #endif
2132         if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2133                 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2134                 opts->no_bpf_event = true;
2135         }
2136
2137         return 0;
2138 }
2139
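/*
 * With --clockid, record the clock id plus a pair of reference timestamps
 * (time of day and the selected clock) in the header so that later
 * tooling can relate perf timestamps to wall-clock time.
 */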
2140 static int record__init_clock(struct record *rec)
2141 {
2142         struct perf_session *session = rec->session;
2143         struct timespec ref_clockid;
2144         struct timeval ref_tod;
2145         u64 ref;
2146
2147         if (!rec->opts.use_clockid)
2148                 return 0;
2149
2150         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2151                 session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2152
2153         session->header.env.clock.clockid = rec->opts.clockid;
2154
2155         if (gettimeofday(&ref_tod, NULL) != 0) {
2156                 pr_err("gettimeofday failed, cannot set reference time.\n");
2157                 return -1;
2158         }
2159
2160         if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2161                 pr_err("clock_gettime failed, cannot set reference time.\n");
2162                 return -1;
2163         }
2164
2165         ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2166               (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2167
2168         session->header.env.clock.tod_ns = ref;
2169
2170         ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2171               (u64) ref_clockid.tv_nsec;
2172
2173         session->header.env.clock.clockid_ns = ref;
2174         return 0;
2175 }
2176
2177 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2178 {
2179         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2180                 trigger_hit(&auxtrace_snapshot_trigger);
2181                 auxtrace_record__snapshot_started = 1;
2182                 if (auxtrace_record__snapshot_start(rec->itr))
2183                         trigger_error(&auxtrace_snapshot_trigger);
2184         }
2185 }
2186
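/*
 * On systems with more than one core PMU, prefix hybrid event names with
 * their PMU ("pmu/event/") so that same-named events from different PMUs
 * can be told apart.
 */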
2187 static void record__uniquify_name(struct record *rec)
2188 {
2189         struct evsel *pos;
2190         struct evlist *evlist = rec->evlist;
2191         char *new_name;
2192         int ret;
2193
2194         if (perf_pmus__num_core_pmus() == 1)
2195                 return;
2196
2197         evlist__for_each_entry(evlist, pos) {
2198                 if (!evsel__is_hybrid(pos))
2199                         continue;
2200
2201                 if (strchr(pos->name, '/'))
2202                         continue;
2203
2204                 ret = asprintf(&new_name, "%s/%s/",
2205                                pos->pmu_name, pos->name);
2206                 if (ret > 0) {
2207                         free(pos->name);
2208                         pos->name = new_name;
2209                 }
2210         }
2211 }
2212
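/*
 * Ask a recording thread to stop by closing the write end of its message
 * pipe, then wait for its acknowledgement on the ack pipe.
 */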
2213 static int record__terminate_thread(struct record_thread *thread_data)
2214 {
2215         int err;
2216         enum thread_msg ack = THREAD_MSG__UNDEFINED;
2217         pid_t tid = thread_data->tid;
2218
2219         close(thread_data->pipes.msg[1]);
2220         thread_data->pipes.msg[1] = -1;
2221         err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2222         if (err > 0)
2223                 pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2224         else
2225                 pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2226                            thread->tid, tid);
2227
2228         return 0;
2229 }
2230
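/*
 * Spawn the additional recording threads for --threads mode with all
 * signals blocked and their CPU affinity preset, waiting for each thread
 * to acknowledge that it has started.
 */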
2231 static int record__start_threads(struct record *rec)
2232 {
2233         int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2234         struct record_thread *thread_data = rec->thread_data;
2235         sigset_t full, mask;
2236         pthread_t handle;
2237         pthread_attr_t attrs;
2238
2239         thread = &thread_data[0];
2240
2241         if (!record__threads_enabled(rec))
2242                 return 0;
2243
2244         sigfillset(&full);
2245         if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2246                 pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2247                 return -1;
2248         }
2249
2250         pthread_attr_init(&attrs);
2251         pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2252
2253         for (t = 1; t < nr_threads; t++) {
2254                 enum thread_msg msg = THREAD_MSG__UNDEFINED;
2255
2256 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2257                 pthread_attr_setaffinity_np(&attrs,
2258                                             MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2259                                             (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2260 #endif
2261                 if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2262                         for (tt = 1; tt < t; tt++)
2263                                 record__terminate_thread(&thread_data[tt]);
2264                         pr_err("Failed to start threads: %s\n", strerror(errno));
2265                         ret = -1;
2266                         goto out_err;
2267                 }
2268
2269                 err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2270                 if (err > 0)
2271                         pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2272                                   thread_msg_tags[msg]);
2273                 else
2274                         pr_warning("threads[%d]: failed to receive start notification from %d\n",
2275                                    thread->tid, rec->thread_data[t].tid);
2276         }
2277
2278         sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2279                         (cpu_set_t *)thread->mask->affinity.bits);
2280
2281         pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2282
2283 out_err:
2284         pthread_attr_destroy(&attrs);
2285
2286         if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2287                 pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2288                 ret = -1;
2289         }
2290
2291         return ret;
2292 }
2293
2294 static int record__stop_threads(struct record *rec)
2295 {
2296         int t;
2297         struct record_thread *thread_data = rec->thread_data;
2298
2299         for (t = 1; t < rec->nr_threads; t++)
2300                 record__terminate_thread(&thread_data[t]);
2301
2302         for (t = 0; t < rec->nr_threads; t++) {
2303                 rec->samples += thread_data[t].samples;
2304                 if (!record__threads_enabled(rec))
2305                         continue;
2306                 rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2307                 rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2308                 pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2309                          thread_data[t].samples, thread_data[t].waking);
2310                 if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2311                         pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2312                                  thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2313                 else
2314                         pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2315         }
2316
2317         return 0;
2318 }
2319
2320 static unsigned long record__waking(struct record *rec)
2321 {
2322         int t;
2323         unsigned long waking = 0;
2324         struct record_thread *thread_data = rec->thread_data;
2325
2326         for (t = 0; t < rec->nr_threads; t++)
2327                 waking += thread_data[t].waking;
2328
2329         return waking;
2330 }
2331
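/*
 * Main body of 'perf record': set up the session, open and mmap the
 * events, start the workload and/or extra recording threads, loop reading
 * the ring buffers until done, then write out lost samples, the
 * tail-synthesized events and the file header.
 */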
2332 static int __cmd_record(struct record *rec, int argc, const char **argv)
2333 {
2334         int err;
2335         int status = 0;
2336         const bool forks = argc > 0;
2337         struct perf_tool *tool = &rec->tool;
2338         struct record_opts *opts = &rec->opts;
2339         struct perf_data *data = &rec->data;
2340         struct perf_session *session;
2341         bool disabled = false, draining = false;
2342         int fd;
2343         float ratio = 0;
2344         enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2345
2346         atexit(record__sig_exit);
2347         signal(SIGCHLD, sig_handler);
2348         signal(SIGINT, sig_handler);
2349         signal(SIGTERM, sig_handler);
2350         signal(SIGSEGV, sigsegv_handler);
2351
2352         if (rec->opts.record_namespaces)
2353                 tool->namespace_events = true;
2354
2355         if (rec->opts.record_cgroup) {
2356 #ifdef HAVE_FILE_HANDLE
2357                 tool->cgroup_events = true;
2358 #else
2359                 pr_err("cgroup tracking is not supported\n");
2360                 return -1;
2361 #endif
2362         }
2363
2364         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2365                 signal(SIGUSR2, snapshot_sig_handler);
2366                 if (rec->opts.auxtrace_snapshot_mode)
2367                         trigger_on(&auxtrace_snapshot_trigger);
2368                 if (rec->switch_output.enabled)
2369                         trigger_on(&switch_output_trigger);
2370         } else {
2371                 signal(SIGUSR2, SIG_IGN);
2372         }
2373
2374         session = perf_session__new(data, tool);
2375         if (IS_ERR(session)) {
2376                 pr_err("Perf session creation failed.\n");
2377                 return PTR_ERR(session);
2378         }
2379
2380         if (record__threads_enabled(rec)) {
2381                 if (perf_data__is_pipe(&rec->data)) {
2382                         pr_err("Parallel trace streaming is not available in pipe mode.\n");
2383                         return -1;
2384                 }
2385                 if (rec->opts.full_auxtrace) {
2386                         pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2387                         return -1;
2388                 }
2389         }
2390
2391         fd = perf_data__fd(data);
2392         rec->session = session;
2393
2394         if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2395                 pr_err("Compression initialization failed.\n");
2396                 return -1;
2397         }
2398 #ifdef HAVE_EVENTFD_SUPPORT
2399         done_fd = eventfd(0, EFD_NONBLOCK);
2400         if (done_fd < 0) {
2401                 pr_err("Failed to create wakeup eventfd, error: %m\n");
2402                 status = -1;
2403                 goto out_delete_session;
2404         }
2405         err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2406         if (err < 0) {
2407                 pr_err("Failed to add wakeup eventfd to poll list\n");
2408                 status = err;
2409                 goto out_delete_session;
2410         }
2411 #endif // HAVE_EVENTFD_SUPPORT
2412
2413         session->header.env.comp_type  = PERF_COMP_ZSTD;
2414         session->header.env.comp_level = rec->opts.comp_level;
2415
2416         if (rec->opts.kcore &&
2417             !record__kcore_readable(&session->machines.host)) {
2418                 pr_err("ERROR: kcore is not readable.\n");
2419                 return -1;
2420         }
2421
2422         if (record__init_clock(rec))
2423                 return -1;
2424
2425         record__init_features(rec);
2426
2427         if (forks) {
2428                 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2429                                                workload_exec_failed_signal);
2430                 if (err < 0) {
2431                         pr_err("Couldn't run the workload!\n");
2432                         status = err;
2433                         goto out_delete_session;
2434                 }
2435         }
2436
2437         /*
2438          * If we have just a single event and are sending data
2439          * through a pipe, we need to force sample ID allocation,
2440          * because we synthesize the event name through the pipe
2441          * and need the ID for that.
2442          */
2443         if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2444                 rec->opts.sample_id = true;
2445
2446         record__uniquify_name(rec);
2447
2448         /* Debug message used by test scripts */
2449         pr_debug3("perf record opening and mmapping events\n");
2450         if (record__open(rec) != 0) {
2451                 err = -1;
2452                 goto out_free_threads;
2453         }
2454         /* Debug message used by test scripts */
2455         pr_debug3("perf record done opening and mmapping events\n");
2456         session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2457
2458         if (rec->opts.kcore) {
2459                 err = record__kcore_copy(&session->machines.host, data);
2460                 if (err) {
2461                         pr_err("ERROR: Failed to copy kcore\n");
2462                         goto out_free_threads;
2463                 }
2464         }
2465
2466         /*
2467          * Normally perf_session__new would do this, but it doesn't have the
2468          * evlist.
2469          */
2470         if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2471                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2472                 rec->tool.ordered_events = false;
2473         }
2474
2475         if (evlist__nr_groups(rec->evlist) == 0)
2476                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2477
2478         if (data->is_pipe) {
2479                 err = perf_header__write_pipe(fd);
2480                 if (err < 0)
2481                         goto out_free_threads;
2482         } else {
2483                 err = perf_session__write_header(session, rec->evlist, fd, false);
2484                 if (err < 0)
2485                         goto out_free_threads;
2486         }
2487
2488         err = -1;
2489         if (!rec->no_buildid
2490             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2491                 pr_err("Couldn't generate buildids. "
2492                        "Use --no-buildid to profile anyway.\n");
2493                 goto out_free_threads;
2494         }
2495
2496         err = record__setup_sb_evlist(rec);
2497         if (err)
2498                 goto out_free_threads;
2499
2500         err = record__synthesize(rec, false);
2501         if (err < 0)
2502                 goto out_free_threads;
2503
2504         if (rec->realtime_prio) {
2505                 struct sched_param param;
2506
2507                 param.sched_priority = rec->realtime_prio;
2508                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2509                         pr_err("Could not set realtime priority.\n");
2510                         err = -1;
2511                         goto out_free_threads;
2512                 }
2513         }
2514
2515         if (record__start_threads(rec))
2516                 goto out_free_threads;
2517
2518         /*
2519          * When perf is starting the traced process, all the events
2520          * (apart from group members) have enable_on_exec=1 set,
2521          * so don't spoil it by prematurely enabling them.
2522          */
2523         if (!target__none(&opts->target) && !opts->target.initial_delay)
2524                 evlist__enable(rec->evlist);
2525
2526         /*
2527          * Let the child rip
2528          */
2529         if (forks) {
2530                 struct machine *machine = &session->machines.host;
2531                 union perf_event *event;
2532                 pid_t tgid;
2533
2534                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2535                 if (event == NULL) {
2536                         err = -ENOMEM;
2537                         goto out_child;
2538                 }
2539
2540                 /*
2541                  * Some H/W events are generated before the COMM event,
2542                  * which is emitted during exec(), so perf script
2543                  * cannot see a correct process name for those events.
2544                  * Synthesize a COMM event up front to prevent that.
2545                  */
2546                 tgid = perf_event__synthesize_comm(tool, event,
2547                                                    rec->evlist->workload.pid,
2548                                                    process_synthesized_event,
2549                                                    machine);
2550                 free(event);
2551
2552                 if (tgid == -1)
2553                         goto out_child;
2554
2555                 event = malloc(sizeof(event->namespaces) +
2556                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2557                                machine->id_hdr_size);
2558                 if (event == NULL) {
2559                         err = -ENOMEM;
2560                         goto out_child;
2561                 }
2562
2563                 /*
2564                  * Synthesize NAMESPACES event for the command specified.
2565                  */
2566                 perf_event__synthesize_namespaces(tool, event,
2567                                                   rec->evlist->workload.pid,
2568                                                   tgid, process_synthesized_event,
2569                                                   machine);
2570                 free(event);
2571
2572                 evlist__start_workload(rec->evlist);
2573         }
2574
2575         if (opts->target.initial_delay) {
2576                 pr_info(EVLIST_DISABLED_MSG);
2577                 if (opts->target.initial_delay > 0) {
2578                         usleep(opts->target.initial_delay * USEC_PER_MSEC);
2579                         evlist__enable(rec->evlist);
2580                         pr_info(EVLIST_ENABLED_MSG);
2581                 }
2582         }
2583
2584         err = event_enable_timer__start(rec->evlist->eet);
2585         if (err)
2586                 goto out_child;
2587
2588         /* Debug message used by test scripts */
2589         pr_debug3("perf record has started\n");
2590         fflush(stderr);
2591
2592         trigger_ready(&auxtrace_snapshot_trigger);
2593         trigger_ready(&switch_output_trigger);
2594         perf_hooks__invoke_record_start();
2595
2596         /*
2597          * Must write FINISHED_INIT so it will be seen after all other
2598          * synthesized user events, but before any regular events.
2599          */
2600         err = write_finished_init(rec, false);
2601         if (err < 0)
2602                 goto out_child;
2603
2604         for (;;) {
2605                 unsigned long long hits = thread->samples;
2606
2607                 /*
2608                  * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
2609                  * here: when done == true and hits != rec->samples
2610                  * in the previous round.
2611                  *
2612                  * evlist__toggle_bkw_mmap ensures we never convert
2613                  * BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2614                  */
2615                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
2616                         evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2617
2618                 if (record__mmap_read_all(rec, false) < 0) {
2619                         trigger_error(&auxtrace_snapshot_trigger);
2620                         trigger_error(&switch_output_trigger);
2621                         err = -1;
2622                         goto out_child;
2623                 }
2624
2625                 if (auxtrace_record__snapshot_started) {
2626                         auxtrace_record__snapshot_started = 0;
2627                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
2628                                 record__read_auxtrace_snapshot(rec, false);
2629                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2630                                 pr_err("AUX area tracing snapshot failed\n");
2631                                 err = -1;
2632                                 goto out_child;
2633                         }
2634                 }
2635
2636                 if (trigger_is_hit(&switch_output_trigger)) {
2637                         /*
2638                          * If switch_output_trigger is hit, the data in the
2639                          * overwritable ring buffer should have been collected,
2640                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2641                          *
2642                          * If SIGUSR2 was raised after or during record__mmap_read_all(),
2643                          * record__mmap_read_all() didn't collect data from the
2644                          * overwritable ring buffer. Read again.
2645                          */
2646                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2647                                 continue;
2648                         trigger_ready(&switch_output_trigger);
2649
2650                         /*
2651                          * Reenable events in overwrite ring buffer after
2652                          * record__mmap_read_all(): we should have collected
2653                          * data from it.
2654                          */
2655                         evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2656
2657                         if (!quiet)
2658                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2659                                         record__waking(rec));
2660                         thread->waking = 0;
2661                         fd = record__switch_output(rec, false);
2662                         if (fd < 0) {
2663                                 pr_err("Failed to switch to new file\n");
2664                                 trigger_error(&switch_output_trigger);
2665                                 err = fd;
2666                                 goto out_child;
2667                         }
2668
2669                         /* re-arm the alarm */
2670                         if (rec->switch_output.time)
2671                                 alarm(rec->switch_output.time);
2672                 }
2673
2674                 if (hits == thread->samples) {
2675                         if (done || draining)
2676                                 break;
2677                         err = fdarray__poll(&thread->pollfd, -1);
2678                         /*
2679                          * Propagate an error only if there is one. Ignore a
2680                          * positive number of returned events and EINTR.
2681                          */
2682                         if (err > 0 || (err < 0 && errno == EINTR))
2683                                 err = 0;
2684                         thread->waking++;
2685
2686                         if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2687                                             record__thread_munmap_filtered, NULL) == 0)
2688                                 draining = true;
2689
2690                         err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2691                         if (err)
2692                                 goto out_child;
2693                 }
2694
2695                 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2696                         switch (cmd) {
2697                         case EVLIST_CTL_CMD_SNAPSHOT:
2698                                 hit_auxtrace_snapshot_trigger(rec);
2699                                 evlist__ctlfd_ack(rec->evlist);
2700                                 break;
2701                         case EVLIST_CTL_CMD_STOP:
2702                                 done = 1;
2703                                 break;
2704                         case EVLIST_CTL_CMD_ACK:
2705                         case EVLIST_CTL_CMD_UNSUPPORTED:
2706                         case EVLIST_CTL_CMD_ENABLE:
2707                         case EVLIST_CTL_CMD_DISABLE:
2708                         case EVLIST_CTL_CMD_EVLIST:
2709                         case EVLIST_CTL_CMD_PING:
2710                         default:
2711                                 break;
2712                         }
2713                 }
2714
2715                 err = event_enable_timer__process(rec->evlist->eet);
2716                 if (err < 0)
2717                         goto out_child;
2718                 if (err) {
2719                         err = 0;
2720                         done = 1;
2721                 }
2722
2723                 /*
2724                  * When perf is starting the traced process, the events die
2725                  * with the process at the end and we wait for that, so there
2726                  * is no need to disable the events in this case.
2727                  */
2728                 if (done && !disabled && !target__none(&opts->target)) {
2729                         trigger_off(&auxtrace_snapshot_trigger);
2730                         evlist__disable(rec->evlist);
2731                         disabled = true;
2732                 }
2733         }
2734
2735         trigger_off(&auxtrace_snapshot_trigger);
2736         trigger_off(&switch_output_trigger);
2737
2738         if (opts->auxtrace_snapshot_on_exit)
2739                 record__auxtrace_snapshot_exit(rec);
2740
2741         if (forks && workload_exec_errno) {
2742                 char msg[STRERR_BUFSIZE], strevsels[2048];
2743                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2744
2745                 evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2746
2747                 pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2748                         strevsels, argv[0], emsg);
2749                 err = -1;
2750                 goto out_child;
2751         }
2752
2753         if (!quiet)
2754                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2755                         record__waking(rec));
2756
2757         write_finished_init(rec, true);
2758
2759         if (target__none(&rec->opts.target))
2760                 record__synthesize_workload(rec, true);
2761
2762 out_child:
2763         record__stop_threads(rec);
2764         record__mmap_read_all(rec, true);
2765 out_free_threads:
2766         record__free_thread_data(rec);
2767         evlist__finalize_ctlfd(rec->evlist);
2768         record__aio_mmap_read_sync(rec);
2769
2770         if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2771                 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2772                 session->header.env.comp_ratio = ratio + 0.5;
2773         }
2774
2775         if (forks) {
2776                 int exit_status;
2777
2778                 if (!child_finished)
2779                         kill(rec->evlist->workload.pid, SIGTERM);
2780
2781                 wait(&exit_status);
2782
2783                 if (err < 0)
2784                         status = err;
2785                 else if (WIFEXITED(exit_status))
2786                         status = WEXITSTATUS(exit_status);
2787                 else if (WIFSIGNALED(exit_status))
2788                         signr = WTERMSIG(exit_status);
2789         } else
2790                 status = err;
2791
2792         if (rec->off_cpu)
2793                 rec->bytes_written += off_cpu_write(rec->session);
2794
2795         record__read_lost_samples(rec);
2796         record__synthesize(rec, true);
2797         /* this will be recalculated during process_buildids() */
2798         rec->samples = 0;
2799
2800         if (!err) {
2801                 if (!rec->timestamp_filename) {
2802                         record__finish_output(rec);
2803                 } else {
2804                         fd = record__switch_output(rec, true);
2805                         if (fd < 0) {
2806                                 status = fd;
2807                                 goto out_delete_session;
2808                         }
2809                 }
2810         }
2811
2812         perf_hooks__invoke_record_end();
2813
2814         if (!err && !quiet) {
2815                 char samples[128];
2816                 const char *postfix = rec->timestamp_filename ?
2817                                         ".<timestamp>" : "";
2818
2819                 if (rec->samples && !rec->opts.full_auxtrace)
2820                         scnprintf(samples, sizeof(samples),
2821                                   " (%" PRIu64 " samples)", rec->samples);
2822                 else
2823                         samples[0] = '\0';
2824
2825                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
2826                         perf_data__size(data) / 1024.0 / 1024.0,
2827                         data->path, postfix, samples);
2828                 if (ratio) {
2829                         fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2830                                         rec->session->bytes_transferred / 1024.0 / 1024.0,
2831                                         ratio);
2832                 }
2833                 fprintf(stderr, " ]\n");
2834         }
2835
2836 out_delete_session:
2837 #ifdef HAVE_EVENTFD_SUPPORT
2838         if (done_fd >= 0) {
2839                 fd = done_fd;
2840                 done_fd = -1;
2841
2842                 close(fd);
2843         }
2844 #endif
2845         zstd_fini(&session->zstd_data);
2846         perf_session__delete(session);
2847
2848         if (!opts->no_bpf_event)
2849                 evlist__stop_sb_thread(rec->sb_evlist);
2850         return status;
2851 }
2852
2853 static void callchain_debug(struct callchain_param *callchain)
2854 {
2855         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2856
2857         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2858
2859         if (callchain->record_mode == CALLCHAIN_DWARF)
2860                 pr_debug("callchain: stack dump size %d\n",
2861                          callchain->dump_size);
2862 }
2863
2864 int record_opts__parse_callchain(struct record_opts *record,
2865                                  struct callchain_param *callchain,
2866                                  const char *arg, bool unset)
2867 {
2868         int ret;
2869         callchain->enabled = !unset;
2870
2871         /* --no-call-graph */
2872         if (unset) {
2873                 callchain->record_mode = CALLCHAIN_NONE;
2874                 pr_debug("callchain: disabled\n");
2875                 return 0;
2876         }
2877
2878         ret = parse_callchain_record_opt(arg, callchain);
2879         if (!ret) {
2880                 /* Enable data address sampling for DWARF unwind. */
2881                 if (callchain->record_mode == CALLCHAIN_DWARF)
2882                         record->sample_address = true;
2883                 callchain_debug(callchain);
2884         }
2885
2886         return ret;
2887 }
2888
2889 int record_parse_callchain_opt(const struct option *opt,
2890                                const char *arg,
2891                                int unset)
2892 {
2893         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2894 }
2895
2896 int record_callchain_opt(const struct option *opt,
2897                          const char *arg __maybe_unused,
2898                          int unset __maybe_unused)
2899 {
2900         struct callchain_param *callchain = opt->value;
2901
2902         callchain->enabled = true;
2903
2904         if (callchain->record_mode == CALLCHAIN_NONE)
2905                 callchain->record_mode = CALLCHAIN_FP;
2906
2907         callchain_debug(callchain);
2908         return 0;
2909 }
2910
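/*
 * perf_record_config() handles the perfconfig keys consumed by 'perf record'.
 * A sketch of a ~/.perfconfig fragment exercising the branches below (the
 * debuginfod URL is a placeholder, not a real server):
 *
 *   [record]
 *           build-id = cache        # or: no-cache, skip, mmap
 *           call-graph = dwarf      # forwarded as call-graph.record-mode
 *           aio = 4                 # only with HAVE_AIO_SUPPORT
 *           debuginfod = https://debuginfod.example.com
 */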
2911 static int perf_record_config(const char *var, const char *value, void *cb)
2912 {
2913         struct record *rec = cb;
2914
2915         if (!strcmp(var, "record.build-id")) {
2916                 if (!strcmp(value, "cache"))
2917                         rec->no_buildid_cache = false;
2918                 else if (!strcmp(value, "no-cache"))
2919                         rec->no_buildid_cache = true;
2920                 else if (!strcmp(value, "skip"))
2921                         rec->no_buildid = true;
2922                 else if (!strcmp(value, "mmap"))
2923                         rec->buildid_mmap = true;
2924                 else
2925                         return -1;
2926                 return 0;
2927         }
2928         if (!strcmp(var, "record.call-graph")) {
2929                 var = "call-graph.record-mode";
2930                 return perf_default_config(var, value, cb);
2931         }
2932 #ifdef HAVE_AIO_SUPPORT
2933         if (!strcmp(var, "record.aio")) {
2934                 rec->opts.nr_cblocks = strtol(value, NULL, 0);
2935                 if (!rec->opts.nr_cblocks)
2936                         rec->opts.nr_cblocks = nr_cblocks_default;
2937         }
2938 #endif
2939         if (!strcmp(var, "record.debuginfod")) {
2940                 rec->debuginfod.urls = strdup(value);
2941                 if (!rec->debuginfod.urls)
2942                         return -ENOMEM;
2943                 rec->debuginfod.set = true;
2944         }
2945
2946         return 0;
2947 }
2948
2949 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2950 {
2951         struct record *rec = (struct record *)opt->value;
2952
2953         return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2954 }
2955
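/*
 * --affinity only recognizes "node" and "cpu" (case-insensitive); any other
 * value silently keeps the default PERF_AFFINITY_SYS.  For example:
 *
 *   perf record --affinity=node -a -- sleep 1
 */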
2956 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2957 {
2958         struct record_opts *opts = (struct record_opts *)opt->value;
2959
2960         if (unset || !str)
2961                 return 0;
2962
2963         if (!strcasecmp(str, "node"))
2964                 opts->affinity = PERF_AFFINITY_NODE;
2965         else if (!strcasecmp(str, "cpu"))
2966                 opts->affinity = PERF_AFFINITY_CPU;
2967
2968         return 0;
2969 }
2970
2971 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2972 {
2973         mask->nbits = nr_bits;
2974         mask->bits = bitmap_zalloc(mask->nbits);
2975         if (!mask->bits)
2976                 return -ENOMEM;
2977
2978         return 0;
2979 }
2980
2981 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2982 {
2983         bitmap_free(mask->bits);
2984         mask->nbits = 0;
2985 }
2986
2987 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2988 {
2989         int ret;
2990
2991         ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2992         if (ret) {
2993                 mask->affinity.bits = NULL;
2994                 return ret;
2995         }
2996
2997         ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
2998         if (ret) {
2999                 record__mmap_cpu_mask_free(&mask->maps);
3000                 mask->maps.bits = NULL;
3001         }
3002
3003         return ret;
3004 }
3005
3006 static void record__thread_mask_free(struct thread_mask *mask)
3007 {
3008         record__mmap_cpu_mask_free(&mask->maps);
3009         record__mmap_cpu_mask_free(&mask->affinity);
3010 }
3011
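/*
 * --threads[=<spec>] selects how the parallel streaming threads are laid
 * out.  With no value the per-CPU layout is used; otherwise <spec> is
 * matched against the thread_spec_tags[] names (cpu, core, package, numa)
 * or, failing that, treated as a user-provided mask specification (see
 * record__init_thread_user_masks() further down).  E.g.:
 *
 *   perf record --threads -a ...          # one thread per CPU
 *   perf record --threads=numa -a ...     # one thread per NUMA node
 */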
3012 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3013 {
3014         int s;
3015         struct record_opts *opts = opt->value;
3016
3017         if (unset || !str || !strlen(str)) {
3018                 opts->threads_spec = THREAD_SPEC__CPU;
3019         } else {
3020                 for (s = 1; s < THREAD_SPEC__MAX; s++) {
3021                         if (s == THREAD_SPEC__USER) {
3022                                 opts->threads_user_spec = strdup(str);
3023                                 if (!opts->threads_user_spec)
3024                                         return -ENOMEM;
3025                                 opts->threads_spec = THREAD_SPEC__USER;
3026                                 break;
3027                         }
3028                         if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3029                                 opts->threads_spec = s;
3030                                 break;
3031                         }
3032                 }
3033         }
3034
3035         if (opts->threads_spec == THREAD_SPEC__USER)
3036                 pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3037         else
3038                 pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3039
3040         return 0;
3041 }
3042
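/*
 * --max-size takes a size with a B/K/M/G suffix, e.g. "--max-size=200M"
 * (an example value) to limit the output file to roughly 200 MB.
 */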
3043 static int parse_output_max_size(const struct option *opt,
3044                                  const char *str, int unset)
3045 {
3046         unsigned long *s = (unsigned long *)opt->value;
3047         static struct parse_tag tags_size[] = {
3048                 { .tag  = 'B', .mult = 1       },
3049                 { .tag  = 'K', .mult = 1 << 10 },
3050                 { .tag  = 'M', .mult = 1 << 20 },
3051                 { .tag  = 'G', .mult = 1 << 30 },
3052                 { .tag  = 0 },
3053         };
3054         unsigned long val;
3055
3056         if (unset) {
3057                 *s = 0;
3058                 return 0;
3059         }
3060
3061         val = parse_tag_value(str, tags_size);
3062         if (val != (unsigned long) -1) {
3063                 *s = val;
3064                 return 0;
3065         }
3066
3067         return -1;
3068 }
3069
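/*
 * -m/--mmap-pages takes "pages[,pages]": the first value sizes the regular
 * mmap data buffers, the optional second one the AUX area tracing buffers.
 * For instance "-m 512,128" (illustrative values) would use 512 data pages
 * and 128 AUX area pages.
 */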
3070 static int record__parse_mmap_pages(const struct option *opt,
3071                                     const char *str,
3072                                     int unset __maybe_unused)
3073 {
3074         struct record_opts *opts = opt->value;
3075         char *s, *p;
3076         unsigned int mmap_pages;
3077         int ret;
3078
3079         if (!str)
3080                 return -EINVAL;
3081
3082         s = strdup(str);
3083         if (!s)
3084                 return -ENOMEM;
3085
3086         p = strchr(s, ',');
3087         if (p)
3088                 *p = '\0';
3089
3090         if (*s) {
3091                 ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3092                 if (ret)
3093                         goto out_free;
3094                 opts->mmap_pages = mmap_pages;
3095         }
3096
3097         if (!p) {
3098                 ret = 0;
3099                 goto out_free;
3100         }
3101
3102         ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3103         if (ret)
3104                 goto out_free;
3105
3106         opts->auxtrace_mmap_pages = mmap_pages;
3107
3108 out_free:
3109         free(s);
3110         return ret;
3111 }
3112
3113 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3114 {
3115 }
3116
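/*
 * --control wires up the runtime control channel, either as pre-opened file
 * descriptors ("fd:ctl-fd[,ack-fd]") or as FIFO paths that perf opens itself
 * ("fifo:ctl-fifo[,ack-fifo]").  A hypothetical FIFO pair could be used as:
 *
 *   mkfifo ctl.fifo ack.fifo
 *   perf record --control=fifo:ctl.fifo,ack.fifo -a ...
 *
 * with "enable"/"disable" commands written to ctl.fifo to toggle the events.
 */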
3117 static int parse_control_option(const struct option *opt,
3118                                 const char *str,
3119                                 int unset __maybe_unused)
3120 {
3121         struct record_opts *opts = opt->value;
3122
3123         return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3124 }
3125
3126 static void switch_output_size_warn(struct record *rec)
3127 {
3128         u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3129         struct switch_output *s = &rec->switch_output;
3130
3131         wakeup_size /= 2;
3132
3133         if (s->size < wakeup_size) {
3134                 char buf[100];
3135
3136                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3137                 pr_warning("WARNING: switch-output data size lower than "
3138                            "wakeup kernel buffer size (%s), "
3139                            "expect bigger perf.data sizes\n", buf);
3140         }
3141 }
3142
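/*
 * switch_output_setup() interprets the --switch-output argument: the literal
 * "signal", a size with a B/K/M/G suffix, or a time with an s/m/h/d suffix.
 * For example (the size and time values here are arbitrary):
 *
 *   perf record --switch-output=signal ...   # rotate on SIGUSR2
 *   perf record --switch-output=100M ...     # rotate every ~100 MB
 *   perf record --switch-output=30s ...      # rotate every 30 seconds
 */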
3143 static int switch_output_setup(struct record *rec)
3144 {
3145         struct switch_output *s = &rec->switch_output;
3146         static struct parse_tag tags_size[] = {
3147                 { .tag  = 'B', .mult = 1       },
3148                 { .tag  = 'K', .mult = 1 << 10 },
3149                 { .tag  = 'M', .mult = 1 << 20 },
3150                 { .tag  = 'G', .mult = 1 << 30 },
3151                 { .tag  = 0 },
3152         };
3153         static struct parse_tag tags_time[] = {
3154                 { .tag  = 's', .mult = 1        },
3155                 { .tag  = 'm', .mult = 60       },
3156                 { .tag  = 'h', .mult = 60*60    },
3157                 { .tag  = 'd', .mult = 60*60*24 },
3158                 { .tag  = 0 },
3159         };
3160         unsigned long val;
3161
3162         /*
3163          * If we're using --switch-output-events, then that implies
3164          * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3165          * thread to its parent.
3166          */
3167         if (rec->switch_output_event_set) {
3168                 if (record__threads_enabled(rec)) {
3169                         pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3170                         return 0;
3171                 }
3172                 goto do_signal;
3173         }
3174
3175         if (!s->set)
3176                 return 0;
3177
3178         if (record__threads_enabled(rec)) {
3179                 pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3180                 return 0;
3181         }
3182
3183         if (!strcmp(s->str, "signal")) {
3184 do_signal:
3185                 s->signal = true;
3186                 pr_debug("switch-output with SIGUSR2 signal\n");
3187                 goto enabled;
3188         }
3189
3190         val = parse_tag_value(s->str, tags_size);
3191         if (val != (unsigned long) -1) {
3192                 s->size = val;
3193                 pr_debug("switch-output with %s size threshold\n", s->str);
3194                 goto enabled;
3195         }
3196
3197         val = parse_tag_value(s->str, tags_time);
3198         if (val != (unsigned long) -1) {
3199                 s->time = val;
3200                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3201                          s->str, s->time);
3202                 goto enabled;
3203         }
3204
3205         return -1;
3206
3207 enabled:
3208         rec->timestamp_filename = true;
3209         s->enabled              = true;
3210
3211         if (s->size && !rec->opts.no_buffering)
3212                 switch_output_size_warn(rec);
3213
3214         return 0;
3215 }
3216
3217 static const char * const __record_usage[] = {
3218         "perf record [<options>] [<command>]",
3219         "perf record [<options>] -- <command> [<options>]",
3220         NULL
3221 };
3222 const char * const *record_usage = __record_usage;
3223
3224 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3225                                   struct perf_sample *sample, struct machine *machine)
3226 {
3227         /*
3228          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3229          * so there is no need to add them twice.
3230          */
3231         if (!(event->header.misc & PERF_RECORD_MISC_USER))
3232                 return 0;
3233         return perf_event__process_mmap(tool, event, sample, machine);
3234 }
3235
3236 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3237                                    struct perf_sample *sample, struct machine *machine)
3238 {
3239         /*
3240          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3241          * so there is no need to add them twice.
3242          */
3243         if (!(event->header.misc & PERF_RECORD_MISC_USER))
3244                 return 0;
3245
3246         return perf_event__process_mmap2(tool, event, sample, machine);
3247 }
3248
3249 static int process_timestamp_boundary(struct perf_tool *tool,
3250                                       union perf_event *event __maybe_unused,
3251                                       struct perf_sample *sample,
3252                                       struct machine *machine __maybe_unused)
3253 {
3254         struct record *rec = container_of(tool, struct record, tool);
3255
3256         set_timestamp_boundary(rec, sample->time);
3257         return 0;
3258 }
3259
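/*
 * --synth narrows which non-sample events get synthesized at startup; the
 * accepted keywords are those listed in the option help below
 * (no|all|task|mmap|cgroup), e.g. "--synth=no" to skip synthesis entirely.
 */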
3260 static int parse_record_synth_option(const struct option *opt,
3261                                      const char *str,
3262                                      int unset __maybe_unused)
3263 {
3264         struct record_opts *opts = opt->value;
3265         char *p = strdup(str);
3266
3267         if (p == NULL)
3268                 return -1;
3269
3270         opts->synth = parse_synth_opt(p);
3271         free(p);
3272
3273         if (opts->synth < 0) {
3274                 pr_err("Invalid synth option: %s\n", str);
3275                 return -1;
3276         }
3277         return 0;
3278 }
3279
3280 /*
3281  * XXX Ideally this would be local to cmd_record() and passed to a record__new()
3282  * because we need to have access to it in record__exit(), which is called
3283  * after cmd_record() exits, but since record_options needs to be accessible to
3284  * builtin-script, leave it here.
3285  *
3286  * At least we don't touch it in all the other functions here directly.
3287  *
3288  * Just say no to tons of global variables, sigh.
3289  */
3290 static struct record record = {
3291         .opts = {
3292                 .sample_time         = true,
3293                 .mmap_pages          = UINT_MAX,
3294                 .user_freq           = UINT_MAX,
3295                 .user_interval       = ULLONG_MAX,
3296                 .freq                = 4000,
3297                 .target              = {
3298                         .uses_mmap   = true,
3299                         .default_per_cpu = true,
3300                 },
3301                 .mmap_flush          = MMAP_FLUSH_DEFAULT,
3302                 .nr_threads_synthesize = 1,
3303                 .ctl_fd              = -1,
3304                 .ctl_fd_ack          = -1,
3305                 .synth               = PERF_SYNTH_ALL,
3306         },
3307         .tool = {
3308                 .sample         = process_sample_event,
3309                 .fork           = perf_event__process_fork,
3310                 .exit           = perf_event__process_exit,
3311                 .comm           = perf_event__process_comm,
3312                 .namespaces     = perf_event__process_namespaces,
3313                 .mmap           = build_id__process_mmap,
3314                 .mmap2          = build_id__process_mmap2,
3315                 .itrace_start   = process_timestamp_boundary,
3316                 .aux            = process_timestamp_boundary,
3317                 .ordered_events = true,
3318         },
3319 };
3320
3321 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3322         "\n\t\t\t\tDefault: fp";
3323
3324 static bool dry_run;
3325
3326 static struct parse_events_option_args parse_events_option_args = {
3327         .evlistp = &record.evlist,
3328 };
3329
3330 static struct parse_events_option_args switch_output_parse_events_option_args = {
3331         .evlistp = &record.sb_evlist,
3332 };
3333
3334 /*
3335  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3336  * with it and switch to use the library functions in perf_evlist that came
3337  * from builtin-record.c, i.e. use record_opts,
3338  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3339  * using pipes, etc.
3340  */
3341 static struct option __record_options[] = {
3342         OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3343                      "event selector. use 'perf list' to list available events",
3344                      parse_events_option),
3345         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3346                      "event filter", parse_filter),
3347         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3348                            NULL, "don't record events from perf itself",
3349                            exclude_perf),
3350         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3351                     "record events on existing process id"),
3352         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3353                     "record events on existing thread id"),
3354         OPT_INTEGER('r', "realtime", &record.realtime_prio,
3355                     "collect data with this RT SCHED_FIFO priority"),
3356         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3357                     "collect data without buffering"),
3358         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3359                     "collect raw sample records from all opened counters"),
3360         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3361                             "system-wide collection from all CPUs"),
3362         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3363                     "list of cpus to monitor"),
3364         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3365         OPT_STRING('o', "output", &record.data.path, "file",
3366                     "output file name"),
3367         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3368                         &record.opts.no_inherit_set,
3369                         "child tasks do not inherit counters"),
3370         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3371                     "synthesize non-sample events at the end of output"),
3372         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3373         OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3374         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3375                     "Fail if the specified frequency can't be used"),
3376         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3377                      "profile at this frequency",
3378                       record__parse_freq),
3379         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3380                      "number of mmap data pages and AUX area tracing mmap pages",
3381                      record__parse_mmap_pages),
3382         OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3383                      "Minimum number of bytes that is extracted from mmap data pages (default: 1)",
3384                      record__mmap_flush_parse),
3385         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3386                            NULL, "enables call-graph recording" ,
3387                            &record_callchain_opt),
3388         OPT_CALLBACK(0, "call-graph", &record.opts,
3389                      "record_mode[,record_size]", record_callchain_help,
3390                      &record_parse_callchain_opt),
3391         OPT_INCR('v', "verbose", &verbose,
3392                     "be more verbose (show counter open errors, etc)"),
3393         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3394         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3395                     "per thread counts"),
3396         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3397         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3398                     "Record the sample physical addresses"),
3399         OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3400                     "Record the sampled data address data page size"),
3401         OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3402                     "Record the sampled code address (ip) page size"),
3403         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3404         OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3405                     "Record the sample identifier"),
3406         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3407                         &record.opts.sample_time_set,
3408                         "Record the sample timestamps"),
3409         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3410                         "Record the sample period"),
3411         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3412                     "don't sample"),
3413         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3414                         &record.no_buildid_cache_set,
3415                         "do not update the buildid cache"),
3416         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3417                         &record.no_buildid_set,
3418                         "do not collect buildids in perf.data"),
3419         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3420                      "monitor event in cgroup name only",
3421                      parse_cgroups),
3422         OPT_CALLBACK('D', "delay", &record, "ms",
3423                      "ms to wait before starting measurement after program start (-1: start with events disabled), "
3424                      "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3425                      record__parse_event_enable_time),
3426         OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3427         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3428                    "user to profile"),
3429
3430         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3431                      "branch any", "sample any taken branches",
3432                      parse_branch_stack),
3433
3434         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3435                      "branch filter mask", "branch stack filter modes",
3436                      parse_branch_stack),
3437         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3438                     "sample by weight (on special events only)"),
3439         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3440                     "sample transaction flags (special events only)"),
3441         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3442                     "use per-thread mmaps"),
3443         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3444                     "sample selected machine registers on interrupt,"
3445                     " use '-I?' to list register names", parse_intr_regs),
3446         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3447                     "sample selected machine registers on interrupt,"
3448                     " use '--user-regs=?' to list register names", parse_user_regs),
3449         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3450                     "Record running/enabled time of read (:S) events"),
3451         OPT_CALLBACK('k', "clockid", &record.opts,
3452         "clockid", "clockid to use for events, see clock_gettime()",
3453         parse_clockid),
3454         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3455                           "opts", "AUX area tracing Snapshot Mode", ""),
3456         OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3457                           "opts", "sample AUX area", ""),
3458         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3459                         "per thread proc mmap processing timeout in ms"),
3460         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3461                     "Record namespaces events"),
3462         OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3463                     "Record cgroup events"),
3464         OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3465                         &record.opts.record_switch_events_set,
3466                         "Record context switch events"),
3467         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3468                          "Configure all used events to run in kernel space.",
3469                          PARSE_OPT_EXCLUSIVE),
3470         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3471                          "Configure all used events to run in user space.",
3472                          PARSE_OPT_EXCLUSIVE),
3473         OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3474                     "collect kernel callchains"),
3475         OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3476                     "collect user callchains"),
3477         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3478                    "file", "vmlinux pathname"),
3479         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3480                     "Record build-id of all DSOs regardless of hits"),
3481         OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3482                     "Record build-id in map events"),
3483         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3484                     "append timestamp to output filename"),
3485         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3486                     "Record timestamp boundary (time of first/last samples)"),
3487         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3488                           &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3489                           "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3490                           "signal"),
3491         OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3492                          &record.switch_output_event_set, "switch output event",
3493                          "switch output event selector. use 'perf list' to list available events",
3494                          parse_events_option_new_evlist),
3495         OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3496                    "Limit number of switch output generated files"),
3497         OPT_BOOLEAN(0, "dry-run", &dry_run,
3498                     "Parse options then exit"),
3499 #ifdef HAVE_AIO_SUPPORT
3500         OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3501                      &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3502                      record__aio_parse),
3503 #endif
3504         OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3505                      "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3506                      record__parse_affinity),
3507 #ifdef HAVE_ZSTD_SUPPORT
3508         OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3509                             "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3510                             record__parse_comp_level),
3511 #endif
3512         OPT_CALLBACK(0, "max-size", &record.output_max_size,
3513                      "size", "Limit the maximum size of the output file", parse_output_max_size),
3514         OPT_UINTEGER(0, "num-thread-synthesize",
3515                      &record.opts.nr_threads_synthesize,
3516                      "number of threads to run for event synthesis"),
3517 #ifdef HAVE_LIBPFM
3518         OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3519                 "libpfm4 event selector. use 'perf list' to list available events",
3520                 parse_libpfm_events_option),
3521 #endif
3522         OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3523                      "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3524                      "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3525                      "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3526                      "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3527                       parse_control_option),
3528         OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3529                      "Fine-tune event synthesis: default=all", parse_record_synth_option),
3530         OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3531                           &record.debuginfod.set, "debuginfod urls",
3532                           "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3533                           "system"),
3534         OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3535                             "write collected trace data into several data files using parallel threads",
3536                             record__parse_threads),
3537         OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3538         OPT_END()
3539 };
3540
3541 struct option *record_options = __record_options;
3542
3543 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3544 {
3545         struct perf_cpu cpu;
3546         int idx;
3547
3548         if (cpu_map__is_dummy(cpus))
3549                 return 0;
3550
3551         perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3552                 if (cpu.cpu == -1)
3553                         continue;
3554                 /* Return ENODEV if input cpu is greater than max cpu */
3555                 if ((unsigned long)cpu.cpu > mask->nbits)
3556                         return -ENODEV;
3557                 __set_bit(cpu.cpu, mask->bits);
3558         }
3559
3560         return 0;
3561 }
3562
3563 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3564 {
3565         struct perf_cpu_map *cpus;
3566
3567         cpus = perf_cpu_map__new(mask_spec);
3568         if (!cpus)
3569                 return -ENOMEM;
3570
3571         bitmap_zero(mask->bits, mask->nbits);
3572         if (record__mmap_cpu_mask_init(mask, cpus))
3573                 return -ENODEV;
3574
3575         perf_cpu_map__put(cpus);
3576
3577         return 0;
3578 }
3579
3580 static void record__free_thread_masks(struct record *rec, int nr_threads)
3581 {
3582         int t;
3583
3584         if (rec->thread_masks)
3585                 for (t = 0; t < nr_threads; t++)
3586                         record__thread_mask_free(&rec->thread_masks[t]);
3587
3588         zfree(&rec->thread_masks);
3589 }
3590
3591 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3592 {
3593         int t, ret;
3594
3595         rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3596         if (!rec->thread_masks) {
3597                 pr_err("Failed to allocate thread masks\n");
3598                 return -ENOMEM;
3599         }
3600
3601         for (t = 0; t < nr_threads; t++) {
3602                 ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3603                 if (ret) {
3604                         pr_err("Failed to allocate thread masks[%d]\n", t);
3605                         goto out_free;
3606                 }
3607         }
3608
3609         return 0;
3610
3611 out_free:
3612         record__free_thread_masks(rec, nr_threads);
3613
3614         return ret;
3615 }
3616
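/*
 * Default parallel layout: one streaming thread per CPU in the evlist CPU
 * map, with both its maps mask and its affinity mask reduced to that single
 * CPU.
 */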
3617 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3618 {
3619         int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3620
3621         ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3622         if (ret)
3623                 return ret;
3624
3625         rec->nr_threads = nr_cpus;
3626         pr_debug("nr_threads: %d\n", rec->nr_threads);
3627
3628         for (t = 0; t < rec->nr_threads; t++) {
3629                 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3630                 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3631                 if (verbose > 0) {
3632                         pr_debug("thread_masks[%d]: ", t);
3633                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3634                         pr_debug("thread_masks[%d]: ", t);
3635                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3636                 }
3637         }
3638
3639         return 0;
3640 }
3641
3642 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3643                                           const char **maps_spec, const char **affinity_spec,
3644                                           u32 nr_spec)
3645 {
3646         u32 s;
3647         int ret = 0, t = 0;
3648         struct mmap_cpu_mask cpus_mask;
3649         struct thread_mask thread_mask, full_mask, *thread_masks;
3650
3651         ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3652         if (ret) {
3653                 pr_err("Failed to allocate CPUs mask\n");
3654                 return ret;
3655         }
3656
3657         ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3658         if (ret) {
3659                 pr_err("Failed to init cpu mask\n");
3660                 goto out_free_cpu_mask;
3661         }
3662
3663         ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3664         if (ret) {
3665                 pr_err("Failed to allocate full mask\n");
3666                 goto out_free_cpu_mask;
3667         }
3668
3669         ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3670         if (ret) {
3671                 pr_err("Failed to allocate thread mask\n");
3672                 goto out_free_full_and_cpu_masks;
3673         }
3674
3675         for (s = 0; s < nr_spec; s++) {
3676                 ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3677                 if (ret) {
3678                         pr_err("Failed to initialize maps thread mask\n");
3679                         goto out_free;
3680                 }
3681                 ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3682                 if (ret) {
3683                         pr_err("Failed to initialize affinity thread mask\n");
3684                         goto out_free;
3685                 }
3686
3687                 /* ignore invalid CPUs but do not allow empty masks */
3688                 if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3689                                 cpus_mask.bits, thread_mask.maps.nbits)) {
3690                         pr_err("Empty maps mask: %s\n", maps_spec[s]);
3691                         ret = -EINVAL;
3692                         goto out_free;
3693                 }
3694                 if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3695                                 cpus_mask.bits, thread_mask.affinity.nbits)) {
3696                         pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3697                         ret = -EINVAL;
3698                         goto out_free;
3699                 }
3700
3701                 /* do not allow intersection with other masks (full_mask) */
3702                 if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3703                                       thread_mask.maps.nbits)) {
3704                         pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3705                         ret = -EINVAL;
3706                         goto out_free;
3707                 }
3708                 if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3709                                       thread_mask.affinity.nbits)) {
3710                         pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3711                         ret = -EINVAL;
3712                         goto out_free;
3713                 }
3714
3715                 bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3716                           thread_mask.maps.bits, full_mask.maps.nbits);
3717                 bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3718                           thread_mask.affinity.bits, full_mask.affinity.nbits);
3719
3720                 thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3721                 if (!thread_masks) {
3722                         pr_err("Failed to reallocate thread masks\n");
3723                         ret = -ENOMEM;
3724                         goto out_free;
3725                 }
3726                 rec->thread_masks = thread_masks;
3727                 rec->thread_masks[t] = thread_mask;
3728                 if (verbose > 0) {
3729                         pr_debug("thread_masks[%d]: ", t);
3730                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3731                         pr_debug("thread_masks[%d]: ", t);
3732                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3733                 }
3734                 t++;
3735                 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3736                 if (ret) {
3737                         pr_err("Failed to allocate thread mask\n");
3738                         goto out_free_full_and_cpu_masks;
3739                 }
3740         }
3741         rec->nr_threads = t;
3742         pr_debug("nr_threads: %d\n", rec->nr_threads);
3743         if (!rec->nr_threads)
3744                 ret = -EINVAL;
3745
3746 out_free:
3747         record__thread_mask_free(&thread_mask);
3748 out_free_full_and_cpu_masks:
3749         record__thread_mask_free(&full_mask);
3750 out_free_cpu_mask:
3751         record__mmap_cpu_mask_free(&cpus_mask);
3752
3753         return ret;
3754 }
3755
3756 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3757 {
3758         int ret;
3759         struct cpu_topology *topo;
3760
3761         topo = cpu_topology__new();
3762         if (!topo) {
3763                 pr_err("Failed to allocate CPU topology\n");
3764                 return -ENOMEM;
3765         }
3766
3767         ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3768                                              topo->core_cpus_list, topo->core_cpus_lists);
3769         cpu_topology__delete(topo);
3770
3771         return ret;
3772 }
3773
3774 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3775 {
3776         int ret;
3777         struct cpu_topology *topo;
3778
3779         topo = cpu_topology__new();
3780         if (!topo) {
3781                 pr_err("Failed to allocate CPU topology\n");
3782                 return -ENOMEM;
3783         }
3784
3785         ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3786                                              topo->package_cpus_list, topo->package_cpus_lists);
3787         cpu_topology__delete(topo);
3788
3789         return ret;
3790 }
3791
3792 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3793 {
3794         u32 s;
3795         int ret;
3796         const char **spec;
3797         struct numa_topology *topo;
3798
3799         topo = numa_topology__new();
3800         if (!topo) {
3801                 pr_err("Failed to allocate NUMA topology\n");
3802                 return -ENOMEM;
3803         }
3804
3805         spec = zalloc(topo->nr * sizeof(char *));
3806         if (!spec) {
3807                 pr_err("Failed to allocate NUMA spec\n");
3808                 ret = -ENOMEM;
3809                 goto out_delete_topo;
3810         }
3811         for (s = 0; s < topo->nr; s++)
3812                 spec[s] = topo->nodes[s].cpus;
3813
3814         ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3815
3816         zfree(&spec);
3817
3818 out_delete_topo:
3819         numa_topology__delete(topo);
3820
3821         return ret;
3822 }
3823
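/*
 * User-provided thread specs take the form
 *
 *   <maps cpus>/<affinity cpus>[:<maps cpus>/<affinity cpus>...]
 *
 * where each cpus field is a CPU list as understood by perf_cpu_map__new()
 * (e.g. "0-3").  A sketch with made-up CPU numbers:
 *
 *   perf record --threads=0-3/0-3:4-7/4-7 -a ...
 *
 * would create two streaming threads, each reading and pinned to its own
 * four CPUs.
 */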
3824 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3825 {
3826         int t, ret;
3827         u32 s, nr_spec = 0;
3828         char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3829         char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3830
3831         for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3832                 spec = strtok_r(user_spec, ":", &spec_ptr);
3833                 if (spec == NULL)
3834                         break;
3835                 pr_debug2("threads_spec[%d]: %s\n", t, spec);
3836                 mask = strtok_r(spec, "/", &mask_ptr);
3837                 if (mask == NULL)
3838                         break;
3839                 pr_debug2("  maps mask: %s\n", mask);
3840                 tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3841                 if (!tmp_spec) {
3842                         pr_err("Failed to reallocate maps spec\n");
3843                         ret = -ENOMEM;
3844                         goto out_free;
3845                 }
3846                 maps_spec = tmp_spec;
3847                 maps_spec[nr_spec] = dup_mask = strdup(mask);
3848                 if (!maps_spec[nr_spec]) {
3849                         pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3850                         ret = -ENOMEM;
3851                         goto out_free;
3852                 }
3853                 mask = strtok_r(NULL, "/", &mask_ptr);
3854                 if (mask == NULL) {
3855                         pr_err("Invalid thread maps or affinity specs\n");
3856                         ret = -EINVAL;
3857                         goto out_free;
3858                 }
3859                 pr_debug2("  affinity mask: %s\n", mask);
3860                 tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3861                 if (!tmp_spec) {
3862                         pr_err("Failed to reallocate affinity spec\n");
3863                         ret = -ENOMEM;
3864                         goto out_free;
3865                 }
3866                 affinity_spec = tmp_spec;
3867                 affinity_spec[nr_spec] = strdup(mask);
3868                 if (!affinity_spec[nr_spec]) {
3869                         pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3870                         ret = -ENOMEM;
3871                         goto out_free;
3872                 }
3873                 dup_mask = NULL;
3874                 nr_spec++;
3875         }
3876
3877         ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3878                                              (const char **)affinity_spec, nr_spec);
3879
3880 out_free:
3881         free(dup_mask);
3882         for (s = 0; s < nr_spec; s++) {
3883                 if (maps_spec)
3884                         free(maps_spec[s]);
3885                 if (affinity_spec)
3886                         free(affinity_spec[s]);
3887         }
3888         free(affinity_spec);
3889         free(maps_spec);
3890
3891         return ret;
3892 }
3893
3894 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3895 {
3896         int ret;
3897
3898         ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3899         if (ret)
3900                 return ret;
3901
3902         if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3903                 return -ENODEV;
3904
3905         rec->nr_threads = 1;
3906
3907         return 0;
3908 }
3909
3910 static int record__init_thread_masks(struct record *rec)
3911 {
3912         int ret = 0;
3913         struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3914
3915         if (!record__threads_enabled(rec))
3916                 return record__init_thread_default_masks(rec, cpus);
3917
3918         if (evlist__per_thread(rec->evlist)) {
3919                 pr_err("--per-thread option is mutually exclusive with parallel streaming mode.\n");
3920                 return -EINVAL;
3921         }
3922
3923         switch (rec->opts.threads_spec) {
3924         case THREAD_SPEC__CPU:
3925                 ret = record__init_thread_cpu_masks(rec, cpus);
3926                 break;
3927         case THREAD_SPEC__CORE:
3928                 ret = record__init_thread_core_masks(rec, cpus);
3929                 break;
3930         case THREAD_SPEC__PACKAGE:
3931                 ret = record__init_thread_package_masks(rec, cpus);
3932                 break;
3933         case THREAD_SPEC__NUMA:
3934                 ret = record__init_thread_numa_masks(rec, cpus);
3935                 break;
3936         case THREAD_SPEC__USER:
3937                 ret = record__init_thread_user_masks(rec, cpus);
3938                 break;
3939         default:
3940                 break;
3941         }
3942
3943         return ret;
3944 }
3945
3946 int cmd_record(int argc, const char **argv)
3947 {
3948         int err;
3949         struct record *rec = &record;
3950         char errbuf[BUFSIZ];
3951
3952         setlocale(LC_ALL, "");
3953
3954 #ifndef HAVE_BPF_SKEL
3955 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3956         set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3957 # undef set_nobuild
3958 #endif
3959
3960         rec->opts.affinity = PERF_AFFINITY_SYS;
3961
3962         rec->evlist = evlist__new();
3963         if (rec->evlist == NULL)
3964                 return -ENOMEM;
3965
3966         err = perf_config(perf_record_config, rec);
3967         if (err)
3968                 return err;
3969
3970         argc = parse_options(argc, argv, record_options, record_usage,
3971                             PARSE_OPT_STOP_AT_NON_OPTION);
3972         if (quiet)
3973                 perf_quiet_option();
3974
3975         err = symbol__validate_sym_arguments();
3976         if (err)
3977                 return err;
3978
3979         perf_debuginfod_setup(&record.debuginfod);
3980
3981         /* Make system wide (-a) the default target. */
3982         if (!argc && target__none(&rec->opts.target))
3983                 rec->opts.target.system_wide = true;
3984
3985         if (nr_cgroups && !rec->opts.target.system_wide) {
3986                 usage_with_options_msg(record_usage, record_options,
3987                         "cgroup monitoring only available in system-wide mode");
3988
3989         }
3990
3991         if (rec->buildid_mmap) {
3992                 if (!perf_can_record_build_id()) {
3993                         pr_err("Failed: no support for recording build id in mmap events, update your kernel.\n");
3994                         err = -EINVAL;
3995                         goto out_opts;
3996                 }
3997                 pr_debug("Enabling build id in mmap2 events.\n");
3998                 /* Enable mmap build id synthesizing. */
3999                 symbol_conf.buildid_mmap2 = true;
4000                 /* Enable perf_event_attr::build_id bit. */
4001                 rec->opts.build_id = true;
4002                 /* Disable build id cache. */
4003                 rec->no_buildid = true;
4004         }
4005
4006         if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4007                 pr_err("Kernel has no cgroup sampling support.\n");
4008                 err = -EINVAL;
4009                 goto out_opts;
4010         }
4011
4012         if (rec->opts.kcore)
4013                 rec->opts.text_poke = true;
4014
4015         if (rec->opts.kcore || record__threads_enabled(rec))
4016                 rec->data.is_dir = true;
4017
4018         if (record__threads_enabled(rec)) {
4019                 if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4020                         pr_err("--affinity option is mutually exclusive with parallel streaming mode.\n");
4021                         goto out_opts;
4022                 }
4023                 if (record__aio_enabled(rec)) {
4024                         pr_err("Asynchronous streaming mode (--aio) is mutually exclusive with parallel streaming mode.\n");
4025                         goto out_opts;
4026                 }
4027         }
4028
4029         if (rec->opts.comp_level != 0) {
4030                 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4031                 rec->no_buildid = true;
4032         }
4033
4034         if (rec->opts.record_switch_events &&
4035             !perf_can_record_switch_events()) {
4036                 ui__error("kernel does not support recording context switch events\n");
4037                 parse_options_usage(record_usage, record_options, "switch-events", 0);
4038                 err = -EINVAL;
4039                 goto out_opts;
4040         }
4041
4042         if (switch_output_setup(rec)) {
4043                 parse_options_usage(record_usage, record_options, "switch-output", 0);
4044                 err = -EINVAL;
4045                 goto out_opts;
4046         }
4047
4048         if (rec->switch_output.time) {
4049                 signal(SIGALRM, alarm_sig_handler);
4050                 alarm(rec->switch_output.time);
4051         }
4052
4053         if (rec->switch_output.num_files) {
4054                 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4055                                                       sizeof(char *));
4056                 if (!rec->switch_output.filenames) {
4057                         err = -EINVAL;
4058                         goto out_opts;
4059                 }
4060         }
4061
4062         if (rec->timestamp_filename && record__threads_enabled(rec)) {
4063                 rec->timestamp_filename = false;
4064                 pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4065         }
4066
4067         /*
4068          * Allow aliases to facilitate the lookup of symbols for address
4069          * filters. Refer to auxtrace_parse_filters().
4070          */
4071         symbol_conf.allow_aliases = true;
4072
4073         symbol__init(NULL);
4074
4075         err = record__auxtrace_init(rec);
4076         if (err)
4077                 goto out;
4078
4079         if (dry_run)
4080                 goto out;
4081
4082         err = -ENOMEM;
4083
4084         if (rec->no_buildid_cache || rec->no_buildid) {
4085                 disable_buildid_cache();
4086         } else if (rec->switch_output.enabled) {
4087                 /*
4088                  * In 'perf record --switch-output', disable buildid
4089                  * generation by default to reduce data file switching
4090                  * overhead. Still generate buildids if they are explicitly
4091                  * required using
4092                  *
4093                  *  perf record --switch-output --no-no-buildid \
4094                  *              --no-no-buildid-cache
4095                  *
4096                  * The following code is equivalent to:
4097                  *
4098                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
4099                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4100                  *         disable_buildid_cache();
4101                  */
4102                 bool disable = true;
4103
4104                 if (rec->no_buildid_set && !rec->no_buildid)
4105                         disable = false;
4106                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4107                         disable = false;
4108                 if (disable) {
4109                         rec->no_buildid = true;
4110                         rec->no_buildid_cache = true;
4111                         disable_buildid_cache();
4112                 }
4113         }
4114
4115         if (record.opts.overwrite)
4116                 record.opts.tail_synthesize = true;
4117
4118         if (rec->evlist->core.nr_entries == 0) {
4119                 bool can_profile_kernel = perf_event_paranoid_check(1);
4120
4121                 err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4122                 if (err)
4123                         goto out;
4124         }
4125
4126         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4127                 rec->opts.no_inherit = true;
4128
4129         err = target__validate(&rec->opts.target);
4130         if (err) {
4131                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4132                 ui__warning("%s\n", errbuf);
4133         }
4134
4135         err = target__parse_uid(&rec->opts.target);
4136         if (err) {
4137                 int saved_errno = errno;
4138
4139                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4140                 ui__error("%s", errbuf);
4141
4142                 err = -saved_errno;
4143                 goto out;
4144         }
4145
4146         /* Enable ignoring missing threads when -u/-p option is defined. */
4147         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4148
4149         evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4150
4151         if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4152                 arch__add_leaf_frame_record_opts(&rec->opts);
4153
4154         err = -ENOMEM;
4155         if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4156                 if (rec->opts.target.pid != NULL) {
4157                         pr_err("Couldn't create thread/CPU maps: %s\n",
4158                                 errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4159                         goto out;
4160                 }
4161                 else
4162                         usage_with_options(record_usage, record_options);
4163         }
4164
4165         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4166         if (err)
4167                 goto out;
4168
4169         /*
4170          * We take all buildids when the file contains
4171          * AUX area tracing data because we do not decode the
4172          * trace, as that would take too long.
4173          */
4174         if (rec->opts.full_auxtrace)
4175                 rec->buildid_all = true;
4176
4177         if (rec->opts.text_poke) {
4178                 err = record__config_text_poke(rec->evlist);
4179                 if (err) {
4180                         pr_err("record__config_text_poke failed, error %d\n", err);
4181                         goto out;
4182                 }
4183         }
4184
4185         if (rec->off_cpu) {
4186                 err = record__config_off_cpu(rec);
4187                 if (err) {
4188                         pr_err("record__config_off_cpu failed, error %d\n", err);
4189                         goto out;
4190                 }
4191         }
4192
4193         if (record_opts__config(&rec->opts)) {
4194                 err = -EINVAL;
4195                 goto out;
4196         }
4197
4198         err = record__init_thread_masks(rec);
4199         if (err) {
4200                 pr_err("Failed to initialize parallel data streaming masks\n");
4201                 goto out;
4202         }
4203
4204         if (rec->opts.nr_cblocks > nr_cblocks_max)
4205                 rec->opts.nr_cblocks = nr_cblocks_max;
4206         pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4207
4208         pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4209         pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4210
4211         if (rec->opts.comp_level > comp_level_max)
4212                 rec->opts.comp_level = comp_level_max;
4213         pr_debug("comp level: %d\n", rec->opts.comp_level);
4214
4215         err = __cmd_record(&record, argc, argv);
4216 out:
4217         evlist__delete(rec->evlist);
4218         symbol__exit();
4219         auxtrace_record__free(rec->itr);
4220 out_opts:
4221         record__free_thread_masks(rec, rec->nr_threads);
4222         rec->nr_threads = 0;
4223         evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4224         return err;
4225 }
4226
4227 static void snapshot_sig_handler(int sig __maybe_unused)
4228 {
4229         struct record *rec = &record;
4230
4231         hit_auxtrace_snapshot_trigger(rec);
4232
4233         if (switch_output_signal(rec))
4234                 trigger_hit(&switch_output_trigger);
4235 }
4236
4237 static void alarm_sig_handler(int sig __maybe_unused)
4238 {
4239         struct record *rec = &record;
4240
4241         if (switch_output_time(rec))
4242                 trigger_hit(&switch_output_trigger);
4243 }