perf/cgroup: Fix child event counting bug
[platform/kernel/linux-rpi.git] / kernel / events / core.c
1 /*
2  * Performance events core code:
3  *
4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8  *
9  * For licensing details see kernel-base/COPYING
10  */
11
12 #include <linux/fs.h>
13 #include <linux/mm.h>
14 #include <linux/cpu.h>
15 #include <linux/smp.h>
16 #include <linux/idr.h>
17 #include <linux/file.h>
18 #include <linux/poll.h>
19 #include <linux/slab.h>
20 #include <linux/hash.h>
21 #include <linux/tick.h>
22 #include <linux/sysfs.h>
23 #include <linux/dcache.h>
24 #include <linux/percpu.h>
25 #include <linux/ptrace.h>
26 #include <linux/reboot.h>
27 #include <linux/vmstat.h>
28 #include <linux/device.h>
29 #include <linux/export.h>
30 #include <linux/vmalloc.h>
31 #include <linux/hardirq.h>
32 #include <linux/rculist.h>
33 #include <linux/uaccess.h>
34 #include <linux/syscalls.h>
35 #include <linux/anon_inodes.h>
36 #include <linux/kernel_stat.h>
37 #include <linux/cgroup.h>
38 #include <linux/perf_event.h>
39 #include <linux/trace_events.h>
40 #include <linux/hw_breakpoint.h>
41 #include <linux/mm_types.h>
42 #include <linux/module.h>
43 #include <linux/mman.h>
44 #include <linux/compat.h>
45 #include <linux/bpf.h>
46 #include <linux/filter.h>
47 #include <linux/namei.h>
48 #include <linux/parser.h>
49 #include <linux/sched/clock.h>
50 #include <linux/sched/mm.h>
51 #include <linux/proc_ns.h>
52 #include <linux/mount.h>
53
54 #include "internal.h"
55
56 #include <asm/irq_regs.h>
57
58 typedef int (*remote_function_f)(void *);
59
60 struct remote_function_call {
61         struct task_struct      *p;
62         remote_function_f       func;
63         void                    *info;
64         int                     ret;
65 };
66
67 static void remote_function(void *data)
68 {
69         struct remote_function_call *tfc = data;
70         struct task_struct *p = tfc->p;
71
72         if (p) {
73                 /* tfc->ret stays -EAGAIN: @p moved to another CPU, caller will retry */
74                 if (task_cpu(p) != smp_processor_id())
75                         return;
76
77                 /*
78                  * Now that we're on right CPU with IRQs disabled, we can test
79                  * if we hit the right task without races.
80                  */
81
82                 tfc->ret = -ESRCH; /* No such (running) process */
83                 if (p != current)
84                         return;
85         }
86
87         tfc->ret = tfc->func(tfc->info);
88 }
89
90 /**
91  * task_function_call - call a function on the cpu on which a task runs
92  * @p:          the task to evaluate
93  * @func:       the function to be called
94  * @info:       the function call argument
95  *
96  * Calls the function @func when the task is currently running. This might
97  * be on the current CPU, in which case the function is called directly.
98  *
99  * returns: @func return value, or
100  *          -ESRCH  - when the process isn't running
101  *          -EAGAIN - when the process moved away
102  */
103 static int
104 task_function_call(struct task_struct *p, remote_function_f func, void *info)
105 {
106         struct remote_function_call data = {
107                 .p      = p,
108                 .func   = func,
109                 .info   = info,
110                 .ret    = -EAGAIN,
111         };
112         int ret;
113
114         do {
115                 ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
116                 if (!ret)
117                         ret = data.ret;
118         } while (ret == -EAGAIN);
119
120         return ret;
121 }
122
123 /**
124  * cpu_function_call - call a function on the cpu
125  * @func:       the function to be called
126  * @info:       the function call argument
127  *
128  * Calls the function @func on the remote cpu.
129  *
130  * returns: @func return value or -ENXIO when the cpu is offline
131  */
132 static int cpu_function_call(int cpu, remote_function_f func, void *info)
133 {
134         struct remote_function_call data = {
135                 .p      = NULL,
136                 .func   = func,
137                 .info   = info,
138                 .ret    = -ENXIO, /* No such CPU */
139         };
140
141         smp_call_function_single(cpu, remote_function, &data, 1);
142
143         return data.ret;
144 }
145
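/*
 * Resolve the per-CPU context that backs @ctx on the local CPU; each PMU
 * carries its own set of per-CPU contexts in pmu->pmu_cpu_context.
 */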
146 static inline struct perf_cpu_context *
147 __get_cpu_context(struct perf_event_context *ctx)
148 {
149         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
150 }
151
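/*
 * Lock ordering: the CPU context lock is always taken before the (optional)
 * task context lock; perf_ctx_unlock() releases them in the reverse order.
 */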
152 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
153                           struct perf_event_context *ctx)
154 {
155         raw_spin_lock(&cpuctx->ctx.lock);
156         if (ctx)
157                 raw_spin_lock(&ctx->lock);
158 }
159
160 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
161                             struct perf_event_context *ctx)
162 {
163         if (ctx)
164                 raw_spin_unlock(&ctx->lock);
165         raw_spin_unlock(&cpuctx->ctx.lock);
166 }
167
168 #define TASK_TOMBSTONE ((void *)-1L)
169
170 static bool is_kernel_event(struct perf_event *event)
171 {
172         return READ_ONCE(event->owner) == TASK_TOMBSTONE;
173 }
174
175 /*
176  * On task ctx scheduling...
177  *
178  * When !ctx->nr_events a task context will not be scheduled. This means
179  * we can disable the scheduler hooks (for performance) without leaving
180  * pending task ctx state.
181  *
182  * This however results in two special cases:
183  *
184  *  - removing the last event from a task ctx; this is relatively
185  *    straightforward and is done in __perf_remove_from_context.
186  *
187  *  - adding the first event to a task ctx; this is tricky because we cannot
188  *    rely on ctx->is_active and therefore cannot use event_function_call().
189  *    See perf_install_in_context().
190  *
191  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
192  */
193
194 typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
195                         struct perf_event_context *, void *);
196
197 struct event_function_struct {
198         struct perf_event *event;
199         event_f func;
200         void *data;
201 };
202
203 static int event_function(void *info)
204 {
205         struct event_function_struct *efs = info;
206         struct perf_event *event = efs->event;
207         struct perf_event_context *ctx = event->ctx;
208         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
209         struct perf_event_context *task_ctx = cpuctx->task_ctx;
210         int ret = 0;
211
212         WARN_ON_ONCE(!irqs_disabled());
213
214         perf_ctx_lock(cpuctx, task_ctx);
215         /*
216          * Since we do the IPI call without holding ctx->lock things can have
217          * changed, double check we hit the task we set out to hit.
218          */
219         if (ctx->task) {
220                 if (ctx->task != current) {
221                         ret = -ESRCH;
222                         goto unlock;
223                 }
224
225                 /*
226                  * We only use event_function_call() on established contexts,
227                  * and event_function() is only ever called when active (or
228                  * rather, we'll have bailed in task_function_call() or the
229                  * above ctx->task != current test), therefore we must have
230                  * ctx->is_active here.
231                  */
232                 WARN_ON_ONCE(!ctx->is_active);
233                 /*
234                  * And since we have ctx->is_active, cpuctx->task_ctx must
235                  * match.
236                  */
237                 WARN_ON_ONCE(task_ctx != ctx);
238         } else {
239                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
240         }
241
242         efs->func(event, cpuctx, ctx, efs->data);
243 unlock:
244         perf_ctx_unlock(cpuctx, task_ctx);
245
246         return ret;
247 }
248
249 static void event_function_call(struct perf_event *event, event_f func, void *data)
250 {
251         struct perf_event_context *ctx = event->ctx;
252         struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
253         struct event_function_struct efs = {
254                 .event = event,
255                 .func = func,
256                 .data = data,
257         };
258
259         if (!event->parent) {
260                 /*
261                  * If this is a !child event, we must hold ctx::mutex to
262  * stabilize the event->ctx relation. See
263                  * perf_event_ctx_lock().
264                  */
265                 lockdep_assert_held(&ctx->mutex);
266         }
267
268         if (!task) {
269                 cpu_function_call(event->cpu, event_function, &efs);
270                 return;
271         }
272
273         if (task == TASK_TOMBSTONE)
274                 return;
275
276 again:
277         if (!task_function_call(task, event_function, &efs))
278                 return;
279
280         raw_spin_lock_irq(&ctx->lock);
281         /*
282          * Reload the task pointer, it might have been changed by
283          * a concurrent perf_event_context_sched_out().
284          */
285         task = ctx->task;
286         if (task == TASK_TOMBSTONE) {
287                 raw_spin_unlock_irq(&ctx->lock);
288                 return;
289         }
290         if (ctx->is_active) {
291                 raw_spin_unlock_irq(&ctx->lock);
292                 goto again;
293         }
294         func(event, NULL, ctx, data);
295         raw_spin_unlock_irq(&ctx->lock);
296 }
297
298 /*
299  * Similar to event_function_call() + event_function(), but hard assumes IRQs
300  * are already disabled and we're on the right CPU.
301  */
302 static void event_function_local(struct perf_event *event, event_f func, void *data)
303 {
304         struct perf_event_context *ctx = event->ctx;
305         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
306         struct task_struct *task = READ_ONCE(ctx->task);
307         struct perf_event_context *task_ctx = NULL;
308
309         WARN_ON_ONCE(!irqs_disabled());
310
311         if (task) {
312                 if (task == TASK_TOMBSTONE)
313                         return;
314
315                 task_ctx = ctx;
316         }
317
318         perf_ctx_lock(cpuctx, task_ctx);
319
320         task = ctx->task;
321         if (task == TASK_TOMBSTONE)
322                 goto unlock;
323
324         if (task) {
325                 /*
326                  * We must be either inactive or active and the right task,
327                  * otherwise we're screwed, since we cannot IPI to somewhere
328                  * else.
329                  */
330                 if (ctx->is_active) {
331                         if (WARN_ON_ONCE(task != current))
332                                 goto unlock;
333
334                         if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
335                                 goto unlock;
336                 }
337         } else {
338                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
339         }
340
341         func(event, cpuctx, ctx, data);
342 unlock:
343         perf_ctx_unlock(cpuctx, task_ctx);
344 }
345
346 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
347                        PERF_FLAG_FD_OUTPUT  |\
348                        PERF_FLAG_PID_CGROUP |\
349                        PERF_FLAG_FD_CLOEXEC)
350
351 /*
352  * branch priv levels that need permission checks
353  */
354 #define PERF_SAMPLE_BRANCH_PERM_PLM \
355         (PERF_SAMPLE_BRANCH_KERNEL |\
356          PERF_SAMPLE_BRANCH_HV)
357
358 enum event_type_t {
359         EVENT_FLEXIBLE = 0x1,
360         EVENT_PINNED = 0x2,
361         EVENT_TIME = 0x4,
362         /* see ctx_resched() for details */
363         EVENT_CPU = 0x8,
364         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
365 };
366
367 /*
368  * perf_sched_events : >0 events exist
369  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
370  */
371
372 static void perf_sched_delayed(struct work_struct *work);
373 DEFINE_STATIC_KEY_FALSE(perf_sched_events);
374 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
375 static DEFINE_MUTEX(perf_sched_mutex);
376 static atomic_t perf_sched_count;
377
378 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
379 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
380 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
381
382 static atomic_t nr_mmap_events __read_mostly;
383 static atomic_t nr_comm_events __read_mostly;
384 static atomic_t nr_namespaces_events __read_mostly;
385 static atomic_t nr_task_events __read_mostly;
386 static atomic_t nr_freq_events __read_mostly;
387 static atomic_t nr_switch_events __read_mostly;
388
389 static LIST_HEAD(pmus);
390 static DEFINE_MUTEX(pmus_lock);
391 static struct srcu_struct pmus_srcu;
392 static cpumask_var_t perf_online_mask;
393
394 /*
395  * perf event paranoia level:
396  *  -1 - not paranoid at all
397  *   0 - disallow raw tracepoint access for unpriv
398  *   1 - disallow cpu events for unpriv
399  *   2 - disallow kernel profiling for unpriv
400  */
401 int sysctl_perf_event_paranoid __read_mostly = 2;
402
403 /* Minimum for 512 kiB + 1 user control page */
404 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
405
406 /*
407  * max perf event sample rate
408  */
409 #define DEFAULT_MAX_SAMPLE_RATE         100000
410 #define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
411 #define DEFAULT_CPU_TIME_MAX_PERCENT    25
412
413 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
414
415 static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
416 static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
417
418 static int perf_sample_allowed_ns __read_mostly =
419         DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
420
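/*
 * Recompute perf_sample_allowed_ns: the average time a perf interrupt may
 * consume, expressed as sysctl_perf_cpu_time_max_percent percent of the
 * current sample period.
 */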
421 static void update_perf_cpu_limits(void)
422 {
423         u64 tmp = perf_sample_period_ns;
424
425         tmp *= sysctl_perf_cpu_time_max_percent;
426         tmp = div_u64(tmp, 100);
427         if (!tmp)
428                 tmp = 1;
429
430         WRITE_ONCE(perf_sample_allowed_ns, tmp);
431 }
432
433 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
434
435 int perf_proc_update_handler(struct ctl_table *table, int write,
436                 void __user *buffer, size_t *lenp,
437                 loff_t *ppos)
438 {
439         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
440
441         if (ret || !write)
442                 return ret;
443
444         /*
445          * If throttling is disabled don't allow the write:
446          */
447         if (sysctl_perf_cpu_time_max_percent == 100 ||
448             sysctl_perf_cpu_time_max_percent == 0)
449                 return -EINVAL;
450
451         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
452         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
453         update_perf_cpu_limits();
454
455         return 0;
456 }
457
458 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
459
460 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
461                                 void __user *buffer, size_t *lenp,
462                                 loff_t *ppos)
463 {
464         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
465
466         if (ret || !write)
467                 return ret;
468
469         if (sysctl_perf_cpu_time_max_percent == 100 ||
470             sysctl_perf_cpu_time_max_percent == 0) {
471                 printk(KERN_WARNING
472                        "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
473                 WRITE_ONCE(perf_sample_allowed_ns, 0);
474         } else {
475                 update_perf_cpu_limits();
476         }
477
478         return 0;
479 }
480
481 /*
482  * perf samples are done in some very critical code paths (NMIs).
483  * If they take too much CPU time, the system can lock up and not
484  * get any real work done.  This will drop the sample rate when
485  * we detect that events are taking too long.
486  */
487 #define NR_ACCUMULATED_SAMPLES 128
488 static DEFINE_PER_CPU(u64, running_sample_length);
489
490 static u64 __report_avg;
491 static u64 __report_allowed;
492
493 static void perf_duration_warn(struct irq_work *w)
494 {
495         printk_ratelimited(KERN_INFO
496                 "perf: interrupt took too long (%lld > %lld), lowering "
497                 "kernel.perf_event_max_sample_rate to %d\n",
498                 __report_avg, __report_allowed,
499                 sysctl_perf_event_sample_rate);
500 }
501
502 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
503
504 void perf_sample_event_took(u64 sample_len_ns)
505 {
506         u64 max_len = READ_ONCE(perf_sample_allowed_ns);
507         u64 running_len;
508         u64 avg_len;
509         u32 max;
510
511         if (max_len == 0)
512                 return;
513
514         /* Decay the counter by 1 average sample. */
515         running_len = __this_cpu_read(running_sample_length);
516         running_len -= running_len/NR_ACCUMULATED_SAMPLES;
517         running_len += sample_len_ns;
518         __this_cpu_write(running_sample_length, running_len);
519
520         /*
521          * Note: this will be biased artificially low until we have
522          * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
523          * from having to maintain a count.
524          */
525         avg_len = running_len/NR_ACCUMULATED_SAMPLES;
526         if (avg_len <= max_len)
527                 return;
528
529         __report_avg = avg_len;
530         __report_allowed = max_len;
531
532         /*
533          * Compute a throttle threshold 25% above the current average duration.
534          */
535         avg_len += avg_len / 4;
536         max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
537         if (avg_len < max)
538                 max /= (u32)avg_len;
539         else
540                 max = 1;
541
542         WRITE_ONCE(perf_sample_allowed_ns, avg_len);
543         WRITE_ONCE(max_samples_per_tick, max);
544
545         sysctl_perf_event_sample_rate = max * HZ;
546         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
547
548         if (!irq_work_queue(&perf_duration_work)) {
549                 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
550                              "kernel.perf_event_max_sample_rate to %d\n",
551                              __report_avg, __report_allowed,
552                              sysctl_perf_event_sample_rate);
553         }
554 }
555
556 static atomic64_t perf_event_id;
557
558 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
559                               enum event_type_t event_type);
560
561 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
562                              enum event_type_t event_type,
563                              struct task_struct *task);
564
565 static void update_context_time(struct perf_event_context *ctx);
566 static u64 perf_event_time(struct perf_event *event);
567
568 void __weak perf_event_print_debug(void)        { }
569
570 extern __weak const char *perf_pmu_name(void)
571 {
572         return "pmu";
573 }
574
575 static inline u64 perf_clock(void)
576 {
577         return local_clock();
578 }
579
580 static inline u64 perf_event_clock(struct perf_event *event)
581 {
582         return event->clock();
583 }
584
585 #ifdef CONFIG_CGROUP_PERF
586
587 static inline bool
588 perf_cgroup_match(struct perf_event *event)
589 {
590         struct perf_event_context *ctx = event->ctx;
591         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
592
593         /* @event doesn't care about cgroup */
594         if (!event->cgrp)
595                 return true;
596
597         /* wants specific cgroup scope but @cpuctx isn't associated with any */
598         if (!cpuctx->cgrp)
599                 return false;
600
601         /*
602          * Cgroup scoping is recursive.  An event enabled for a cgroup is
603          * also enabled for all its descendant cgroups.  If @cpuctx's
604          * cgroup is a descendant of @event's (the test covers identity
605          * case), it's a match.
606          */
607         return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
608                                     event->cgrp->css.cgroup);
609 }
610
611 static inline void perf_detach_cgroup(struct perf_event *event)
612 {
613         css_put(&event->cgrp->css);
614         event->cgrp = NULL;
615 }
616
617 static inline int is_cgroup_event(struct perf_event *event)
618 {
619         return event->cgrp != NULL;
620 }
621
622 static inline u64 perf_cgroup_event_time(struct perf_event *event)
623 {
624         struct perf_cgroup_info *t;
625
626         t = per_cpu_ptr(event->cgrp->info, event->cpu);
627         return t->time;
628 }
629
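/*
 * Fold the time elapsed since the last update into this cgroup's per-CPU
 * time and advance its timestamp; the cgroup counterpart of
 * update_context_time().
 */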
630 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
631 {
632         struct perf_cgroup_info *info;
633         u64 now;
634
635         now = perf_clock();
636
637         info = this_cpu_ptr(cgrp->info);
638
639         info->time += now - info->timestamp;
640         info->timestamp = now;
641 }
642
643 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
644 {
645         struct perf_cgroup *cgrp = cpuctx->cgrp;
646         struct cgroup_subsys_state *css;
647
648         if (cgrp) {
649                 for (css = &cgrp->css; css; css = css->parent) {
650                         cgrp = container_of(css, struct perf_cgroup, css);
651                         __update_cgrp_time(cgrp);
652                 }
653         }
654 }
655
656 static inline void update_cgrp_time_from_event(struct perf_event *event)
657 {
658         struct perf_cgroup *cgrp;
659
660         /*
661          * ensure we access cgroup data only when needed and
662          * when we know the cgroup is pinned (css_get)
663          */
664         if (!is_cgroup_event(event))
665                 return;
666
667         cgrp = perf_cgroup_from_task(current, event->ctx);
668         /*
669          * Do not update time when cgroup is not active
670          */
671         if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
672                 __update_cgrp_time(event->cgrp);
673 }
674
675 static inline void
676 perf_cgroup_set_timestamp(struct task_struct *task,
677                           struct perf_event_context *ctx)
678 {
679         struct perf_cgroup *cgrp;
680         struct perf_cgroup_info *info;
681         struct cgroup_subsys_state *css;
682
683         /*
684          * ctx->lock held by caller
685          * ensure we do not access cgroup data
686          * unless we have the cgroup pinned (css_get)
687          */
688         if (!task || !ctx->nr_cgroups)
689                 return;
690
691         cgrp = perf_cgroup_from_task(task, ctx);
692
693         for (css = &cgrp->css; css; css = css->parent) {
694                 cgrp = container_of(css, struct perf_cgroup, css);
695                 info = this_cpu_ptr(cgrp->info);
696                 info->timestamp = ctx->timestamp;
697         }
698 }
699
700 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
701
702 #define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
703 #define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
704
705 /*
706  * reschedule events based on the cgroup constraint of task.
707  *
708  * mode SWOUT : schedule out everything
709  * mode SWIN : schedule in based on cgroup for next
710  */
711 static void perf_cgroup_switch(struct task_struct *task, int mode)
712 {
713         struct perf_cpu_context *cpuctx;
714         struct list_head *list;
715         unsigned long flags;
716
717         /*
718          * Disable interrupts and preemption to keep this CPU's
719          * cgrp_cpuctx_entry from changing under us.
720          */
721         local_irq_save(flags);
722
723         list = this_cpu_ptr(&cgrp_cpuctx_list);
724         list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
725                 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
726
727                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
728                 perf_pmu_disable(cpuctx->ctx.pmu);
729
730                 if (mode & PERF_CGROUP_SWOUT) {
731                         cpu_ctx_sched_out(cpuctx, EVENT_ALL);
732                         /*
733                          * must not be done before ctxswout due
734                          * to event_filter_match() in event_sched_out()
735                          */
736                         cpuctx->cgrp = NULL;
737                 }
738
739                 if (mode & PERF_CGROUP_SWIN) {
740                         WARN_ON_ONCE(cpuctx->cgrp);
741                         /*
742                          * set cgrp before ctxsw in to allow
743                          * event_filter_match() to not have to pass
744                          * task around.
745                          * We pass cpuctx->ctx to perf_cgroup_from_task()
746                          * because cgroup events are only per-cpu.
747                          */
748                         cpuctx->cgrp = perf_cgroup_from_task(task,
749                                                              &cpuctx->ctx);
750                         cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
751                 }
752                 perf_pmu_enable(cpuctx->ctx.pmu);
753                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
754         }
755
756         local_irq_restore(flags);
757 }
758
759 static inline void perf_cgroup_sched_out(struct task_struct *task,
760                                          struct task_struct *next)
761 {
762         struct perf_cgroup *cgrp1;
763         struct perf_cgroup *cgrp2 = NULL;
764
765         rcu_read_lock();
766         /*
767          * we come here when we know perf_cgroup_events > 0
768          * we do not need to pass the ctx here because we know
769          * we are holding the rcu lock
770          */
771         cgrp1 = perf_cgroup_from_task(task, NULL);
772         cgrp2 = perf_cgroup_from_task(next, NULL);
773
774         /*
775          * only schedule out current cgroup events if we know
776          * that we are switching to a different cgroup. Otherwise,
777          * do not touch the cgroup events.
778          */
779         if (cgrp1 != cgrp2)
780                 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
781
782         rcu_read_unlock();
783 }
784
785 static inline void perf_cgroup_sched_in(struct task_struct *prev,
786                                         struct task_struct *task)
787 {
788         struct perf_cgroup *cgrp1;
789         struct perf_cgroup *cgrp2 = NULL;
790
791         rcu_read_lock();
792         /*
793          * we come here when we know perf_cgroup_events > 0
794          * we do not need to pass the ctx here because we know
795          * we are holding the rcu lock
796          */
797         cgrp1 = perf_cgroup_from_task(task, NULL);
798         cgrp2 = perf_cgroup_from_task(prev, NULL);
799
800         /*
801          * only need to schedule in cgroup events if we are changing
802          * cgroup during ctxsw. Cgroup events were not scheduled
803          * out during the context switch if that was not the case.
804          */
805         if (cgrp1 != cgrp2)
806                 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
807
808         rcu_read_unlock();
809 }
810
811 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
812                                       struct perf_event_attr *attr,
813                                       struct perf_event *group_leader)
814 {
815         struct perf_cgroup *cgrp;
816         struct cgroup_subsys_state *css;
817         struct fd f = fdget(fd);
818         int ret = 0;
819
820         if (!f.file)
821                 return -EBADF;
822
823         css = css_tryget_online_from_dir(f.file->f_path.dentry,
824                                          &perf_event_cgrp_subsys);
825         if (IS_ERR(css)) {
826                 ret = PTR_ERR(css);
827                 goto out;
828         }
829
830         cgrp = container_of(css, struct perf_cgroup, css);
831         event->cgrp = cgrp;
832
833         /*
834          * all events in a group must monitor
835          * the same cgroup because a task belongs
836          * to only one perf cgroup at a time
837          */
838         if (group_leader && group_leader->cgrp != cgrp) {
839                 perf_detach_cgroup(event);
840                 ret = -EINVAL;
841         }
842 out:
843         fdput(f);
844         return ret;
845 }
846
847 static inline void
848 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
849 {
850         struct perf_cgroup_info *t;
851         t = per_cpu_ptr(event->cgrp->info, event->cpu);
852         event->shadow_ctx_time = now - t->timestamp;
853 }
854
855 static inline void
856 perf_cgroup_defer_enabled(struct perf_event *event)
857 {
858         /*
859          * when the current task's perf cgroup does not match
860          * the event's, we need to remember to call the
861          * perf_cgroup_mark_enabled() function the first time a task with
862          * a matching perf cgroup is scheduled in.
863          */
864         if (is_cgroup_event(event) && !perf_cgroup_match(event))
865                 event->cgrp_defer_enabled = 1;
866 }
867
868 static inline void
869 perf_cgroup_mark_enabled(struct perf_event *event,
870                          struct perf_event_context *ctx)
871 {
872         struct perf_event *sub;
873         u64 tstamp = perf_event_time(event);
874
875         if (!event->cgrp_defer_enabled)
876                 return;
877
878         event->cgrp_defer_enabled = 0;
879
880         event->tstamp_enabled = tstamp - event->total_time_enabled;
881         list_for_each_entry(sub, &event->sibling_list, group_entry) {
882                 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
883                         sub->tstamp_enabled = tstamp - sub->total_time_enabled;
884                         sub->cgrp_defer_enabled = 0;
885                 }
886         }
887 }
888
889 /*
890  * Update cpuctx->cgrp so that it is set when first cgroup event is added and
891  * cleared when last cgroup event is removed.
892  */
893 static inline void
894 list_update_cgroup_event(struct perf_event *event,
895                          struct perf_event_context *ctx, bool add)
896 {
897         struct perf_cpu_context *cpuctx;
898         struct list_head *cpuctx_entry;
899
900         if (!is_cgroup_event(event))
901                 return;
902
903         if (add && ctx->nr_cgroups++)
904                 return;
905         else if (!add && --ctx->nr_cgroups)
906                 return;
907         /*
908          * Because cgroup events are always per-cpu events,
909          * this will always be called from the right CPU.
910          */
911         cpuctx = __get_cpu_context(ctx);
912         cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
913         /* cpuctx->cgrp is NULL unless a cgroup event is active on this CPU. */
914         if (add) {
915                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
916
917                 list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
918                 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
919                         cpuctx->cgrp = cgrp;
920         } else {
921                 list_del(cpuctx_entry);
922                 cpuctx->cgrp = NULL;
923         }
924 }
925
926 #else /* !CONFIG_CGROUP_PERF */
927
928 static inline bool
929 perf_cgroup_match(struct perf_event *event)
930 {
931         return true;
932 }
933
934 static inline void perf_detach_cgroup(struct perf_event *event)
935 {}
936
937 static inline int is_cgroup_event(struct perf_event *event)
938 {
939         return 0;
940 }
941
942 static inline void update_cgrp_time_from_event(struct perf_event *event)
943 {
944 }
945
946 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
947 {
948 }
949
950 static inline void perf_cgroup_sched_out(struct task_struct *task,
951                                          struct task_struct *next)
952 {
953 }
954
955 static inline void perf_cgroup_sched_in(struct task_struct *prev,
956                                         struct task_struct *task)
957 {
958 }
959
960 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
961                                       struct perf_event_attr *attr,
962                                       struct perf_event *group_leader)
963 {
964         return -EINVAL;
965 }
966
967 static inline void
968 perf_cgroup_set_timestamp(struct task_struct *task,
969                           struct perf_event_context *ctx)
970 {
971 }
972
973 void
974 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
975 {
976 }
977
978 static inline void
979 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
980 {
981 }
982
983 static inline u64 perf_cgroup_event_time(struct perf_event *event)
984 {
985         return 0;
986 }
987
988 static inline void
989 perf_cgroup_defer_enabled(struct perf_event *event)
990 {
991 }
992
993 static inline void
994 perf_cgroup_mark_enabled(struct perf_event *event,
995                          struct perf_event_context *ctx)
996 {
997 }
998
999 static inline void
1000 list_update_cgroup_event(struct perf_event *event,
1001                          struct perf_event_context *ctx, bool add)
1002 {
1003 }
1004
1005 #endif
1006
1007 /*
1008  * set default to be dependent on timer tick just
1009  * like original code
1010  */
1011 #define PERF_CPU_HRTIMER (1000 / HZ)
1012 /*
1013  * function must be called with interrupts disabled
1014  */
1015 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1016 {
1017         struct perf_cpu_context *cpuctx;
1018         int rotations = 0;
1019
1020         WARN_ON(!irqs_disabled());
1021
1022         cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1023         rotations = perf_rotate_context(cpuctx);
1024
1025         raw_spin_lock(&cpuctx->hrtimer_lock);
1026         if (rotations)
1027                 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1028         else
1029                 cpuctx->hrtimer_active = 0;
1030         raw_spin_unlock(&cpuctx->hrtimer_lock);
1031
1032         return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1033 }
1034
1035 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1036 {
1037         struct hrtimer *timer = &cpuctx->hrtimer;
1038         struct pmu *pmu = cpuctx->ctx.pmu;
1039         u64 interval;
1040
1041         /* no multiplexing needed for SW PMU */
1042         if (pmu->task_ctx_nr == perf_sw_context)
1043                 return;
1044
1045         /*
1046          * check default is sane, if not set then force to
1047          * default interval (1/tick)
1048          */
1049         interval = pmu->hrtimer_interval_ms;
1050         if (interval < 1)
1051                 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1052
1053         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1054
1055         raw_spin_lock_init(&cpuctx->hrtimer_lock);
1056         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1057         timer->function = perf_mux_hrtimer_handler;
1058 }
1059
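/*
 * (Re)arm the per-CPU multiplexing hrtimer if it is not already active;
 * software PMUs never multiplex and are skipped.
 */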
1060 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1061 {
1062         struct hrtimer *timer = &cpuctx->hrtimer;
1063         struct pmu *pmu = cpuctx->ctx.pmu;
1064         unsigned long flags;
1065
1066         /* not for SW PMU */
1067         if (pmu->task_ctx_nr == perf_sw_context)
1068                 return 0;
1069
1070         raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1071         if (!cpuctx->hrtimer_active) {
1072                 cpuctx->hrtimer_active = 1;
1073                 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1074                 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1075         }
1076         raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1077
1078         return 0;
1079 }
1080
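/*
 * perf_pmu_disable()/perf_pmu_enable() nest: only the first disable and the
 * matching last enable on this CPU reach the PMU driver; the per-CPU count
 * tracks the nesting depth.
 */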
1081 void perf_pmu_disable(struct pmu *pmu)
1082 {
1083         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1084         if (!(*count)++)
1085                 pmu->pmu_disable(pmu);
1086 }
1087
1088 void perf_pmu_enable(struct pmu *pmu)
1089 {
1090         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1091         if (!--(*count))
1092                 pmu->pmu_enable(pmu);
1093 }
1094
1095 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1096
1097 /*
1098  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
1099  * perf_event_task_tick() are fully serialized because they're strictly cpu
1100  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
1101  * disabled, while perf_event_task_tick is called from IRQ context.
1102  */
1103 static void perf_event_ctx_activate(struct perf_event_context *ctx)
1104 {
1105         struct list_head *head = this_cpu_ptr(&active_ctx_list);
1106
1107         WARN_ON(!irqs_disabled());
1108
1109         WARN_ON(!list_empty(&ctx->active_ctx_list));
1110
1111         list_add(&ctx->active_ctx_list, head);
1112 }
1113
1114 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1115 {
1116         WARN_ON(!irqs_disabled());
1117
1118         WARN_ON(list_empty(&ctx->active_ctx_list));
1119
1120         list_del_init(&ctx->active_ctx_list);
1121 }
1122
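/*
 * Context reference counting: get_ctx() may only be used on a context whose
 * refcount is already known to be non-zero, and the final put_ctx() drops
 * the parent and task references before freeing the context after an RCU
 * grace period.
 */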
1123 static void get_ctx(struct perf_event_context *ctx)
1124 {
1125         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
1126 }
1127
1128 static void free_ctx(struct rcu_head *head)
1129 {
1130         struct perf_event_context *ctx;
1131
1132         ctx = container_of(head, struct perf_event_context, rcu_head);
1133         kfree(ctx->task_ctx_data);
1134         kfree(ctx);
1135 }
1136
1137 static void put_ctx(struct perf_event_context *ctx)
1138 {
1139         if (atomic_dec_and_test(&ctx->refcount)) {
1140                 if (ctx->parent_ctx)
1141                         put_ctx(ctx->parent_ctx);
1142                 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1143                         put_task_struct(ctx->task);
1144                 call_rcu(&ctx->rcu_head, free_ctx);
1145         }
1146 }
1147
1148 /*
1149  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1150  * perf_pmu_migrate_context() we need some magic.
1151  *
1152  * Those places that change perf_event::ctx will hold both
1153  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1154  *
1155  * Lock ordering is by mutex address. There are two other sites where
1156  * perf_event_context::mutex nests and those are:
1157  *
1158  *  - perf_event_exit_task_context()    [ child , 0 ]
1159  *      perf_event_exit_event()
1160  *        put_event()                   [ parent, 1 ]
1161  *
1162  *  - perf_event_init_context()         [ parent, 0 ]
1163  *      inherit_task_group()
1164  *        inherit_group()
1165  *          inherit_event()
1166  *            perf_event_alloc()
1167  *              perf_init_event()
1168  *                perf_try_init_event() [ child , 1 ]
1169  *
1170  * While it appears there is an obvious deadlock here -- the parent and child
1171  * nesting levels are inverted between the two -- this is in fact safe because
1172  * life-time rules separate them: an exiting task cannot fork, and a
1173  * spawning task cannot (yet) exit.
1174  *
1175  * But remember that these are parent<->child context relations, and
1176  * migration does not affect children, therefore these two orderings should not
1177  * interact.
1178  *
1179  * The change in perf_event::ctx does not affect children (as claimed above)
1180  * because the sys_perf_event_open() case will install a new event and break
1181  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1182  * concerned with cpuctx and that doesn't have children.
1183  *
1184  * The places that change perf_event::ctx will issue:
1185  *
1186  *   perf_remove_from_context();
1187  *   synchronize_rcu();
1188  *   perf_install_in_context();
1189  *
1190  * to effect the change. The remove_from_context() + synchronize_rcu() should
1191  * quiesce the event, after which we can install it in the new location. This
1192  * means that only external vectors (perf_fops, prctl) can perturb the event
1193  * while in transit. Therefore all such accessors should also acquire
1194  * perf_event_context::mutex to serialize against this.
1195  *
1196  * However; because event->ctx can change while we're waiting to acquire
1197  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1198  * function.
1199  *
1200  * Lock order:
1201  *    cred_guard_mutex
1202  *      task_struct::perf_event_mutex
1203  *        perf_event_context::mutex
1204  *          perf_event::child_mutex;
1205  *            perf_event_context::lock
1206  *          perf_event::mmap_mutex
1207  *          mmap_sem
1208  */
1209 static struct perf_event_context *
1210 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1211 {
1212         struct perf_event_context *ctx;
1213
1214 again:
1215         rcu_read_lock();
1216         ctx = ACCESS_ONCE(event->ctx);
1217         if (!atomic_inc_not_zero(&ctx->refcount)) {
1218                 rcu_read_unlock();
1219                 goto again;
1220         }
1221         rcu_read_unlock();
1222
1223         mutex_lock_nested(&ctx->mutex, nesting);
1224         if (event->ctx != ctx) {
1225                 mutex_unlock(&ctx->mutex);
1226                 put_ctx(ctx);
1227                 goto again;
1228         }
1229
1230         return ctx;
1231 }
1232
1233 static inline struct perf_event_context *
1234 perf_event_ctx_lock(struct perf_event *event)
1235 {
1236         return perf_event_ctx_lock_nested(event, 0);
1237 }
1238
1239 static void perf_event_ctx_unlock(struct perf_event *event,
1240                                   struct perf_event_context *ctx)
1241 {
1242         mutex_unlock(&ctx->mutex);
1243         put_ctx(ctx);
1244 }
1245
1246 /*
1247  * This must be done under the ctx->lock, such as to serialize against
1248  * context_equiv(), therefore we cannot call put_ctx() since that might end up
1249  * calling scheduler related locks and ctx->lock nests inside those.
1250  */
1251 static __must_check struct perf_event_context *
1252 unclone_ctx(struct perf_event_context *ctx)
1253 {
1254         struct perf_event_context *parent_ctx = ctx->parent_ctx;
1255
1256         lockdep_assert_held(&ctx->lock);
1257
1258         if (parent_ctx)
1259                 ctx->parent_ctx = NULL;
1260         ctx->generation++;
1261
1262         return parent_ctx;
1263 }
1264
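/*
 * Report @p's pid (or tgid) in the pid namespace the top-level event was
 * created in; tasks that have already exited are reported as -1 rather
 * than 0.
 */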
1265 static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1266                                 enum pid_type type)
1267 {
1268         u32 nr;
1269         /*
1270          * only top level events have the pid namespace they were created in
1271          */
1272         if (event->parent)
1273                 event = event->parent;
1274
1275         nr = __task_pid_nr_ns(p, type, event->ns);
1276         /* avoid -1 if it is idle thread or runs in another ns */
1277         if (!nr && !pid_alive(p))
1278                 nr = -1;
1279         return nr;
1280 }
1281
1282 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1283 {
1284         return perf_event_pid_type(event, p, __PIDTYPE_TGID);
1285 }
1286
1287 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1288 {
1289         return perf_event_pid_type(event, p, PIDTYPE_PID);
1290 }
1291
1292 /*
1293  * If we inherit events we want to return the parent event id
1294  * to userspace.
1295  */
1296 static u64 primary_event_id(struct perf_event *event)
1297 {
1298         u64 id = event->id;
1299
1300         if (event->parent)
1301                 id = event->parent->id;
1302
1303         return id;
1304 }
1305
1306 /*
1307  * Get the perf_event_context for a task and lock it.
1308  *
1309  * This has to cope with the fact that until it is locked,
1310  * the context could get moved to another task.
1311  */
1312 static struct perf_event_context *
1313 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1314 {
1315         struct perf_event_context *ctx;
1316
1317 retry:
1318         /*
1319          * One of the few rules of preemptible RCU is that one cannot do
1320          * rcu_read_unlock() while holding a scheduler (or nested) lock when
1321          * part of the read side critical section was irqs-enabled -- see
1322          * rcu_read_unlock_special().
1323          *
1324          * Since ctx->lock nests under rq->lock we must ensure the entire read
1325          * side critical section has interrupts disabled.
1326          */
1327         local_irq_save(*flags);
1328         rcu_read_lock();
1329         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1330         if (ctx) {
1331                 /*
1332                  * If this context is a clone of another, it might
1333                  * get swapped for another underneath us by
1334                  * perf_event_task_sched_out, though the
1335                  * rcu_read_lock() protects us from any context
1336                  * getting freed.  Lock the context and check if it
1337                  * got swapped before we could get the lock, and retry
1338                  * if so.  If we locked the right context, then it
1339                  * can't get swapped on us any more.
1340                  */
1341                 raw_spin_lock(&ctx->lock);
1342                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1343                         raw_spin_unlock(&ctx->lock);
1344                         rcu_read_unlock();
1345                         local_irq_restore(*flags);
1346                         goto retry;
1347                 }
1348
1349                 if (ctx->task == TASK_TOMBSTONE ||
1350                     !atomic_inc_not_zero(&ctx->refcount)) {
1351                         raw_spin_unlock(&ctx->lock);
1352                         ctx = NULL;
1353                 } else {
1354                         WARN_ON_ONCE(ctx->task != task);
1355                 }
1356         }
1357         rcu_read_unlock();
1358         if (!ctx)
1359                 local_irq_restore(*flags);
1360         return ctx;
1361 }
1362
1363 /*
1364  * Get the context for a task and increment its pin_count so it
1365  * can't get swapped to another task.  This also increments its
1366  * reference count so that the context can't get freed.
1367  */
1368 static struct perf_event_context *
1369 perf_pin_task_context(struct task_struct *task, int ctxn)
1370 {
1371         struct perf_event_context *ctx;
1372         unsigned long flags;
1373
1374         ctx = perf_lock_task_context(task, ctxn, &flags);
1375         if (ctx) {
1376                 ++ctx->pin_count;
1377                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1378         }
1379         return ctx;
1380 }
1381
1382 static void perf_unpin_context(struct perf_event_context *ctx)
1383 {
1384         unsigned long flags;
1385
1386         raw_spin_lock_irqsave(&ctx->lock, flags);
1387         --ctx->pin_count;
1388         raw_spin_unlock_irqrestore(&ctx->lock, flags);
1389 }
1390
1391 /*
1392  * Update the record of the current time in a context.
1393  */
1394 static void update_context_time(struct perf_event_context *ctx)
1395 {
1396         u64 now = perf_clock();
1397
1398         ctx->time += now - ctx->timestamp;
1399         ctx->timestamp = now;
1400 }
1401
1402 static u64 perf_event_time(struct perf_event *event)
1403 {
1404         struct perf_event_context *ctx = event->ctx;
1405
1406         if (is_cgroup_event(event))
1407                 return perf_cgroup_event_time(event);
1408
1409         return ctx ? ctx->time : 0;
1410 }
1411
1412 /*
1413  * Update the total_time_enabled and total_time_running fields for an event.
1414  */
1415 static void update_event_times(struct perf_event *event)
1416 {
1417         struct perf_event_context *ctx = event->ctx;
1418         u64 run_end;
1419
1420         lockdep_assert_held(&ctx->lock);
1421
1422         if (event->state < PERF_EVENT_STATE_INACTIVE ||
1423             event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1424                 return;
1425
1426         /*
1427          * in cgroup mode, time_enabled represents
1428          * the time the event was enabled AND active
1429          * tasks were in the monitored cgroup. This is
1430          * independent of the activity of the context as
1431          * there may be a mix of cgroup and non-cgroup events.
1432          *
1433          * That is why we treat cgroup events differently
1434          * here.
1435          */
1436         if (is_cgroup_event(event))
1437                 run_end = perf_cgroup_event_time(event);
1438         else if (ctx->is_active)
1439                 run_end = ctx->time;
1440         else
1441                 run_end = event->tstamp_stopped;
1442
1443         event->total_time_enabled = run_end - event->tstamp_enabled;
1444
1445         if (event->state == PERF_EVENT_STATE_INACTIVE)
1446                 run_end = event->tstamp_stopped;
1447         else
1448                 run_end = perf_event_time(event);
1449
1450         event->total_time_running = run_end - event->tstamp_running;
1451
1452 }
1453
1454 /*
1455  * Update total_time_enabled and total_time_running for all events in a group.
1456  */
1457 static void update_group_times(struct perf_event *leader)
1458 {
1459         struct perf_event *event;
1460
1461         update_event_times(leader);
1462         list_for_each_entry(event, &leader->sibling_list, group_entry)
1463                 update_event_times(event);
1464 }
1465
1466 static enum event_type_t get_event_type(struct perf_event *event)
1467 {
1468         struct perf_event_context *ctx = event->ctx;
1469         enum event_type_t event_type;
1470
1471         lockdep_assert_held(&ctx->lock);
1472
1473         /*
1474          * It's 'group type', really, because if our group leader is
1475          * pinned, so are we.
1476          */
1477         if (event->group_leader != event)
1478                 event = event->group_leader;
1479
1480         event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1481         if (!ctx->task)
1482                 event_type |= EVENT_CPU;
1483
1484         return event_type;
1485 }
1486
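/*
 * Pinned and flexible events live on separate group lists within a context;
 * pick the one this event belongs on.
 */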
1487 static struct list_head *
1488 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1489 {
1490         if (event->attr.pinned)
1491                 return &ctx->pinned_groups;
1492         else
1493                 return &ctx->flexible_groups;
1494 }
1495
1496 /*
1497  * Add an event to the lists for its context.
1498  * Must be called with ctx->mutex and ctx->lock held.
1499  */
1500 static void
1501 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1502 {
1503         lockdep_assert_held(&ctx->lock);
1504
1505         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1506         event->attach_state |= PERF_ATTACH_CONTEXT;
1507
1508         /*
1509          * If we're a standalone event or group leader, we go to the context
1510          * list; group events are kept attached to the group so that
1511          * perf_group_detach can, at all times, locate all siblings.
1512          */
1513         if (event->group_leader == event) {
1514                 struct list_head *list;
1515
1516                 event->group_caps = event->event_caps;
1517
1518                 list = ctx_group_list(event, ctx);
1519                 list_add_tail(&event->group_entry, list);
1520         }
1521
1522         list_update_cgroup_event(event, ctx, true);
1523
1524         list_add_rcu(&event->event_entry, &ctx->event_list);
1525         ctx->nr_events++;
1526         if (event->attr.inherit_stat)
1527                 ctx->nr_stat++;
1528
1529         ctx->generation++;
1530 }
1531
1532 /*
1533  * Initialize event state based on the perf_event_attr::disabled.
1534  */
1535 static inline void perf_event__state_init(struct perf_event *event)
1536 {
1537         event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1538                                               PERF_EVENT_STATE_INACTIVE;
1539 }
1540
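/*
 * Pre-compute the size of the data returned by read() for this event: one
 * u64 per counter value (plus an ID per value if PERF_FORMAT_ID is set),
 * optional enabled/running time fields, and for PERF_FORMAT_GROUP a u64
 * member count plus one entry per sibling.
 */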
1541 static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1542 {
1543         int entry = sizeof(u64); /* value */
1544         int size = 0;
1545         int nr = 1;
1546
1547         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1548                 size += sizeof(u64);
1549
1550         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1551                 size += sizeof(u64);
1552
1553         if (event->attr.read_format & PERF_FORMAT_ID)
1554                 entry += sizeof(u64);
1555
1556         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1557                 nr += nr_siblings;
1558                 size += sizeof(u64);
1559         }
1560
1561         size += entry * nr;
1562         event->read_size = size;
1563 }
1564
1565 static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1566 {
1567         struct perf_sample_data *data;
1568         u16 size = 0;
1569
1570         if (sample_type & PERF_SAMPLE_IP)
1571                 size += sizeof(data->ip);
1572
1573         if (sample_type & PERF_SAMPLE_ADDR)
1574                 size += sizeof(data->addr);
1575
1576         if (sample_type & PERF_SAMPLE_PERIOD)
1577                 size += sizeof(data->period);
1578
1579         if (sample_type & PERF_SAMPLE_WEIGHT)
1580                 size += sizeof(data->weight);
1581
1582         if (sample_type & PERF_SAMPLE_READ)
1583                 size += event->read_size;
1584
1585         if (sample_type & PERF_SAMPLE_DATA_SRC)
1586                 size += sizeof(data->data_src.val);
1587
1588         if (sample_type & PERF_SAMPLE_TRANSACTION)
1589                 size += sizeof(data->txn);
1590
1591         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1592                 size += sizeof(data->phys_addr);
1593
1594         event->header_size = size;
1595 }
1596
1597 /*
1598  * Called at perf_event creation and when events are attached/detached from a
1599  * group.
1600  */
1601 static void perf_event__header_size(struct perf_event *event)
1602 {
1603         __perf_event_read_size(event,
1604                                event->group_leader->nr_siblings);
1605         __perf_event_header_size(event, event->attr.sample_type);
1606 }
1607
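/*
 * Compute the combined size of the ID sample fields (TID, time, identifier,
 * stream ID, CPU) selected by the event's sample_type.
 */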
1608 static void perf_event__id_header_size(struct perf_event *event)
1609 {
1610         struct perf_sample_data *data;
1611         u64 sample_type = event->attr.sample_type;
1612         u16 size = 0;
1613
1614         if (sample_type & PERF_SAMPLE_TID)
1615                 size += sizeof(data->tid_entry);
1616
1617         if (sample_type & PERF_SAMPLE_TIME)
1618                 size += sizeof(data->time);
1619
1620         if (sample_type & PERF_SAMPLE_IDENTIFIER)
1621                 size += sizeof(data->id);
1622
1623         if (sample_type & PERF_SAMPLE_ID)
1624                 size += sizeof(data->id);
1625
1626         if (sample_type & PERF_SAMPLE_STREAM_ID)
1627                 size += sizeof(data->stream_id);
1628
1629         if (sample_type & PERF_SAMPLE_CPU)
1630                 size += sizeof(data->cpu_entry);
1631
1632         event->id_header_size = size;
1633 }
1634
1635 static bool perf_event_validate_size(struct perf_event *event)
1636 {
1637         /*
1638          * The values computed here will be over-written when we actually
1639          * attach the event.
1640          */
1641         __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1642         __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1643         perf_event__id_header_size(event);
1644
1645         /*
1646          * Sum the lot; it should not exceed the 64k limit we have on records.
1647          * The 16k check is conservative, to allow for callchains and other variable fields.
1648          */
1649         if (event->read_size + event->header_size +
1650             event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1651                 return false;
1652
1653         return true;
1654 }
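/*
 * Editor's note (illustrative): with PERF_FORMAT_GROUP | PERF_FORMAT_ID the
 * read_size computed above works out to 8 + 16 * (nr_siblings + 2), so a
 * group on the order of a thousand events already trips the 16k check and
 * the would-be sibling is rejected before it can be attached.
 */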
1655
1656 static void perf_group_attach(struct perf_event *event)
1657 {
1658         struct perf_event *group_leader = event->group_leader, *pos;
1659
1660         lockdep_assert_held(&event->ctx->lock);
1661
1662         /*
1663          * We can have double attach due to group movement in perf_event_open.
1664          */
1665         if (event->attach_state & PERF_ATTACH_GROUP)
1666                 return;
1667
1668         event->attach_state |= PERF_ATTACH_GROUP;
1669
1670         if (group_leader == event)
1671                 return;
1672
1673         WARN_ON_ONCE(group_leader->ctx != event->ctx);
1674
1675         group_leader->group_caps &= event->event_caps;
1676
1677         list_add_tail(&event->group_entry, &group_leader->sibling_list);
1678         group_leader->nr_siblings++;
1679
1680         perf_event__header_size(group_leader);
1681
1682         list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1683                 perf_event__header_size(pos);
1684 }
1685
1686 /*
1687  * Remove an event from the lists for its context.
1688  * Must be called with ctx->mutex and ctx->lock held.
1689  */
1690 static void
1691 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1692 {
1693         WARN_ON_ONCE(event->ctx != ctx);
1694         lockdep_assert_held(&ctx->lock);
1695
1696         /*
1697          * We can have double detach due to exit/hot-unplug + close.
1698          */
1699         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1700                 return;
1701
1702         event->attach_state &= ~PERF_ATTACH_CONTEXT;
1703
1704         list_update_cgroup_event(event, ctx, false);
1705
1706         ctx->nr_events--;
1707         if (event->attr.inherit_stat)
1708                 ctx->nr_stat--;
1709
1710         list_del_rcu(&event->event_entry);
1711
1712         if (event->group_leader == event)
1713                 list_del_init(&event->group_entry);
1714
1715         update_group_times(event);
1716
1717         /*
1718          * If the event was in error state, keep it
1719          * that way; otherwise bogus counts will be
1720          * returned on read(). The only way to get out
1721          * of error state is by explicitly re-enabling
1722          * the event.
1723          */
1724         if (event->state > PERF_EVENT_STATE_OFF)
1725                 event->state = PERF_EVENT_STATE_OFF;
1726
1727         ctx->generation++;
1728 }
1729
1730 static void perf_group_detach(struct perf_event *event)
1731 {
1732         struct perf_event *sibling, *tmp;
1733         struct list_head *list = NULL;
1734
1735         lockdep_assert_held(&event->ctx->lock);
1736
1737         /*
1738          * We can have double detach due to exit/hot-unplug + close.
1739          */
1740         if (!(event->attach_state & PERF_ATTACH_GROUP))
1741                 return;
1742
1743         event->attach_state &= ~PERF_ATTACH_GROUP;
1744
1745         /*
1746          * If this is a sibling, remove it from its group.
1747          */
1748         if (event->group_leader != event) {
1749                 list_del_init(&event->group_entry);
1750                 event->group_leader->nr_siblings--;
1751                 goto out;
1752         }
1753
1754         if (!list_empty(&event->group_entry))
1755                 list = &event->group_entry;
1756
1757         /*
1758          * If this was a group event with sibling events then
1759          * upgrade the siblings to singleton events by adding them
1760          * to whatever list we are on.
1761          */
1762         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1763                 if (list)
1764                         list_move_tail(&sibling->group_entry, list);
1765                 sibling->group_leader = sibling;
1766
1767                 /* Inherit group flags from the previous leader */
1768                 sibling->group_caps = event->group_caps;
1769
1770                 WARN_ON_ONCE(sibling->ctx != event->ctx);
1771         }
1772
1773 out:
1774         perf_event__header_size(event->group_leader);
1775
1776         list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1777                 perf_event__header_size(tmp);
1778 }
1779
1780 static bool is_orphaned_event(struct perf_event *event)
1781 {
1782         return event->state == PERF_EVENT_STATE_DEAD;
1783 }
1784
1785 static inline int __pmu_filter_match(struct perf_event *event)
1786 {
1787         struct pmu *pmu = event->pmu;
1788         return pmu->filter_match ? pmu->filter_match(event) : 1;
1789 }
1790
1791 /*
1792  * Check whether we should attempt to schedule an event group based on
1793  * PMU-specific filtering. An event group can consist of HW and SW events,
1794  * potentially with a SW leader, so we must check all the filters to
1795  * determine whether a group is schedulable:
1796  */
1797 static inline int pmu_filter_match(struct perf_event *event)
1798 {
1799         struct perf_event *child;
1800
1801         if (!__pmu_filter_match(event))
1802                 return 0;
1803
1804         list_for_each_entry(child, &event->sibling_list, group_entry) {
1805                 if (!__pmu_filter_match(child))
1806                         return 0;
1807         }
1808
1809         return 1;
1810 }
1811
1812 static inline int
1813 event_filter_match(struct perf_event *event)
1814 {
1815         return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
1816                perf_cgroup_match(event) && pmu_filter_match(event);
1817 }
1818
1819 static void
1820 event_sched_out(struct perf_event *event,
1821                   struct perf_cpu_context *cpuctx,
1822                   struct perf_event_context *ctx)
1823 {
1824         u64 tstamp = perf_event_time(event);
1825         u64 delta;
1826
1827         WARN_ON_ONCE(event->ctx != ctx);
1828         lockdep_assert_held(&ctx->lock);
1829
1830         /*
1831          * An event which could not be activated because of
1832          * filter mismatch still needs to have its timings
1833          * maintained, otherwise bogus information is returned
1834          * via read() for time_enabled, time_running:
1835          */
1836         if (event->state == PERF_EVENT_STATE_INACTIVE &&
1837             !event_filter_match(event)) {
1838                 delta = tstamp - event->tstamp_stopped;
1839                 event->tstamp_running += delta;
1840                 event->tstamp_stopped = tstamp;
1841         }
1842
1843         if (event->state != PERF_EVENT_STATE_ACTIVE)
1844                 return;
1845
1846         perf_pmu_disable(event->pmu);
1847
1848         event->tstamp_stopped = tstamp;
1849         event->pmu->del(event, 0);
1850         event->oncpu = -1;
1851         event->state = PERF_EVENT_STATE_INACTIVE;
1852         if (event->pending_disable) {
1853                 event->pending_disable = 0;
1854                 event->state = PERF_EVENT_STATE_OFF;
1855         }
1856
1857         if (!is_software_event(event))
1858                 cpuctx->active_oncpu--;
1859         if (!--ctx->nr_active)
1860                 perf_event_ctx_deactivate(ctx);
1861         if (event->attr.freq && event->attr.sample_freq)
1862                 ctx->nr_freq--;
1863         if (event->attr.exclusive || !cpuctx->active_oncpu)
1864                 cpuctx->exclusive = 0;
1865
1866         perf_pmu_enable(event->pmu);
1867 }
1868
1869 static void
1870 group_sched_out(struct perf_event *group_event,
1871                 struct perf_cpu_context *cpuctx,
1872                 struct perf_event_context *ctx)
1873 {
1874         struct perf_event *event;
1875         int state = group_event->state;
1876
1877         perf_pmu_disable(ctx->pmu);
1878
1879         event_sched_out(group_event, cpuctx, ctx);
1880
1881         /*
1882          * Schedule out siblings (if any):
1883          */
1884         list_for_each_entry(event, &group_event->sibling_list, group_entry)
1885                 event_sched_out(event, cpuctx, ctx);
1886
1887         perf_pmu_enable(ctx->pmu);
1888
1889         if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1890                 cpuctx->exclusive = 0;
1891 }
1892
1893 #define DETACH_GROUP    0x01UL
1894
1895 /*
1896  * Cross CPU call to remove a performance event
1897  *
1898  * We disable the event on the hardware level first. After that we
1899  * remove it from the context list.
1900  */
1901 static void
1902 __perf_remove_from_context(struct perf_event *event,
1903                            struct perf_cpu_context *cpuctx,
1904                            struct perf_event_context *ctx,
1905                            void *info)
1906 {
1907         unsigned long flags = (unsigned long)info;
1908
1909         event_sched_out(event, cpuctx, ctx);
1910         if (flags & DETACH_GROUP)
1911                 perf_group_detach(event);
1912         list_del_event(event, ctx);
1913
1914         if (!ctx->nr_events && ctx->is_active) {
1915                 ctx->is_active = 0;
1916                 if (ctx->task) {
1917                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
1918                         cpuctx->task_ctx = NULL;
1919                 }
1920         }
1921 }
1922
1923 /*
1924  * Remove the event from a task's (or a CPU's) list of events.
1925  *
1926  * If event->ctx is a cloned context, callers must make sure that
1927  * every task struct that event->ctx->task could possibly point to
1928  * remains valid.  This is OK when called from perf_release since
1929  * that only calls us on the top-level context, which can't be a clone.
1930  * When called from perf_event_exit_task, it's OK because the
1931  * context has been detached from its task.
1932  */
1933 static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
1934 {
1935         struct perf_event_context *ctx = event->ctx;
1936
1937         lockdep_assert_held(&ctx->mutex);
1938
1939         event_function_call(event, __perf_remove_from_context, (void *)flags);
1940
1941         /*
1942          * The above event_function_call() can NO-OP when it hits
1943          * TASK_TOMBSTONE. In that case we must already have been detached
1944          * from the context (by perf_event_exit_event()) but the grouping
1945  * might still be intact.
1946          */
1947         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1948         if ((flags & DETACH_GROUP) &&
1949             (event->attach_state & PERF_ATTACH_GROUP)) {
1950                 /*
1951                  * Since in that case we cannot possibly be scheduled, simply
1952                  * detach now.
1953                  */
1954                 raw_spin_lock_irq(&ctx->lock);
1955                 perf_group_detach(event);
1956                 raw_spin_unlock_irq(&ctx->lock);
1957         }
1958 }
1959
1960 /*
1961  * Cross CPU call to disable a performance event
1962  */
1963 static void __perf_event_disable(struct perf_event *event,
1964                                  struct perf_cpu_context *cpuctx,
1965                                  struct perf_event_context *ctx,
1966                                  void *info)
1967 {
1968         if (event->state < PERF_EVENT_STATE_INACTIVE)
1969                 return;
1970
1971         update_context_time(ctx);
1972         update_cgrp_time_from_event(event);
1973         update_group_times(event);
1974         if (event == event->group_leader)
1975                 group_sched_out(event, cpuctx, ctx);
1976         else
1977                 event_sched_out(event, cpuctx, ctx);
1978         event->state = PERF_EVENT_STATE_OFF;
1979 }
1980
1981 /*
1982  * Disable an event.
1983  *
1984  * If event->ctx is a cloned context, callers must make sure that
1985  * every task struct that event->ctx->task could possibly point to
1986  * remains valid.  This condition is satisfied when called through
1987  * perf_event_for_each_child or perf_event_for_each because they
1988  * hold the top-level event's child_mutex, so any descendant that
1989  * goes to exit will block in perf_event_exit_event().
1990  *
1991  * When called from perf_pending_event it's OK because event->ctx
1992  * is the current context on this CPU and preemption is disabled,
1993  * hence we can't get into perf_event_task_sched_out for this context.
1994  */
1995 static void _perf_event_disable(struct perf_event *event)
1996 {
1997         struct perf_event_context *ctx = event->ctx;
1998
1999         raw_spin_lock_irq(&ctx->lock);
2000         if (event->state <= PERF_EVENT_STATE_OFF) {
2001                 raw_spin_unlock_irq(&ctx->lock);
2002                 return;
2003         }
2004         raw_spin_unlock_irq(&ctx->lock);
2005
2006         event_function_call(event, __perf_event_disable, NULL);
2007 }
2008
2009 void perf_event_disable_local(struct perf_event *event)
2010 {
2011         event_function_local(event, __perf_event_disable, NULL);
2012 }
2013
2014 /*
2015  * Strictly speaking kernel users cannot create groups and therefore this
2016  * interface does not need the perf_event_ctx_lock() magic.
2017  */
2018 void perf_event_disable(struct perf_event *event)
2019 {
2020         struct perf_event_context *ctx;
2021
2022         ctx = perf_event_ctx_lock(event);
2023         _perf_event_disable(event);
2024         perf_event_ctx_unlock(event, ctx);
2025 }
2026 EXPORT_SYMBOL_GPL(perf_event_disable);
2027
2028 void perf_event_disable_inatomic(struct perf_event *event)
2029 {
2030         event->pending_disable = 1;
2031         irq_work_queue(&event->pending);
2032 }
2033
2034 static void perf_set_shadow_time(struct perf_event *event,
2035                                  struct perf_event_context *ctx,
2036                                  u64 tstamp)
2037 {
2038         /*
2039          * use the correct time source for the time snapshot
2040          *
2041          * We could get by without this by leveraging the
2042          * fact that to get to this function, the caller
2043          * has most likely already called update_context_time()
2044          * and update_cgrp_time_xx() and thus both timestamps
2045          * are identical (or very close). Given that tstamp is
2046          * already adjusted for cgroup, we could say that:
2047          *    tstamp - ctx->timestamp
2048          * is equivalent to
2049          *    tstamp - cgrp->timestamp.
2050          *
2051          * Then, in perf_output_read(), the calculation would
2052          * work with no changes because:
2053          * - the event is guaranteed to be scheduled in
2054          * - it is not scheduled out in between
2055          * - thus the timestamp would be the same
2056          *
2057          * But this is a bit hairy.
2058          *
2059          * So instead, we have an explicit cgroup call to remain
2060          * within the time source all along. We believe it
2061          * is cleaner and simpler to understand.
2062          */
2063         if (is_cgroup_event(event))
2064                 perf_cgroup_set_shadow_time(event, tstamp);
2065         else
2066                 event->shadow_ctx_time = tstamp - ctx->timestamp;
2067 }
2068
2069 #define MAX_INTERRUPTS (~0ULL)
2070
2071 static void perf_log_throttle(struct perf_event *event, int enable);
2072 static void perf_log_itrace_start(struct perf_event *event);
2073
2074 static int
2075 event_sched_in(struct perf_event *event,
2076                  struct perf_cpu_context *cpuctx,
2077                  struct perf_event_context *ctx)
2078 {
2079         u64 tstamp = perf_event_time(event);
2080         int ret = 0;
2081
2082         lockdep_assert_held(&ctx->lock);
2083
2084         if (event->state <= PERF_EVENT_STATE_OFF)
2085                 return 0;
2086
2087         WRITE_ONCE(event->oncpu, smp_processor_id());
2088         /*
2089          * Order event::oncpu write to happen before the ACTIVE state
2090          * is visible.
2091          */
2092         smp_wmb();
2093         WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
2094
2095         /*
2096          * Unthrottle events: since we just scheduled in, we might have missed
2097          * several ticks already, and for a heavily scheduling task there is
2098          * little guarantee it'll get a tick in a timely manner.
2099          */
2100         if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2101                 perf_log_throttle(event, 1);
2102                 event->hw.interrupts = 0;
2103         }
2104
2105         /*
2106          * The new state must be visible before we turn it on in the hardware:
2107          */
2108         smp_wmb();
2109
2110         perf_pmu_disable(event->pmu);
2111
2112         perf_set_shadow_time(event, ctx, tstamp);
2113
2114         perf_log_itrace_start(event);
2115
2116         if (event->pmu->add(event, PERF_EF_START)) {
2117                 event->state = PERF_EVENT_STATE_INACTIVE;
2118                 event->oncpu = -1;
2119                 ret = -EAGAIN;
2120                 goto out;
2121         }
2122
2123         event->tstamp_running += tstamp - event->tstamp_stopped;
2124
2125         if (!is_software_event(event))
2126                 cpuctx->active_oncpu++;
2127         if (!ctx->nr_active++)
2128                 perf_event_ctx_activate(ctx);
2129         if (event->attr.freq && event->attr.sample_freq)
2130                 ctx->nr_freq++;
2131
2132         if (event->attr.exclusive)
2133                 cpuctx->exclusive = 1;
2134
2135 out:
2136         perf_pmu_enable(event->pmu);
2137
2138         return ret;
2139 }
2140
2141 static int
2142 group_sched_in(struct perf_event *group_event,
2143                struct perf_cpu_context *cpuctx,
2144                struct perf_event_context *ctx)
2145 {
2146         struct perf_event *event, *partial_group = NULL;
2147         struct pmu *pmu = ctx->pmu;
2148         u64 now = ctx->time;
2149         bool simulate = false;
2150
2151         if (group_event->state == PERF_EVENT_STATE_OFF)
2152                 return 0;
2153
2154         pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2155
2156         if (event_sched_in(group_event, cpuctx, ctx)) {
2157                 pmu->cancel_txn(pmu);
2158                 perf_mux_hrtimer_restart(cpuctx);
2159                 return -EAGAIN;
2160         }
2161
2162         /*
2163          * Schedule in siblings as one group (if any):
2164          */
2165         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2166                 if (event_sched_in(event, cpuctx, ctx)) {
2167                         partial_group = event;
2168                         goto group_error;
2169                 }
2170         }
2171
2172         if (!pmu->commit_txn(pmu))
2173                 return 0;
2174
2175 group_error:
2176         /*
2177          * Groups can be scheduled in as one unit only, so undo any
2178          * partial group before returning:
2179          * The events up to the failed event are scheduled out normally;
2180          * their tstamp_stopped will be updated.
2181          *
2182          * The failed events and the remaining siblings need to have
2183          * their timings updated as if they had gone thru event_sched_in()
2184          * their timings updated as if they had gone through event_sched_in()
2185          * across the group. This also takes care of the case where the group
2186          * could never be scheduled by ensuring tstamp_stopped is set to mark
2187          * the time the event was actually stopped, such that time delta
2188          * calculation in update_event_times() is correct.
2189          */
2190         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2191                 if (event == partial_group)
2192                         simulate = true;
2193
2194                 if (simulate) {
2195                         event->tstamp_running += now - event->tstamp_stopped;
2196                         event->tstamp_stopped = now;
2197                 } else {
2198                         event_sched_out(event, cpuctx, ctx);
2199                 }
2200         }
2201         event_sched_out(group_event, cpuctx, ctx);
2202
2203         pmu->cancel_txn(pmu);
2204
2205         perf_mux_hrtimer_restart(cpuctx);
2206
2207         return -EAGAIN;
2208 }
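/*
 * Editor's note (illustrative): the start_txn()/commit_txn() pair lets the
 * PMU accept or reject the whole group atomically. If commit_txn() fails,
 * partial_group stays NULL, so the group_error path schedules every member
 * back out, cancel_txn() undoes the transaction and -EAGAIN leaves it to the
 * multiplexing hrtimer to retry the group later.
 */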
2209
2210 /*
2211  * Work out whether we can put this event group on the CPU now.
2212  */
2213 static int group_can_go_on(struct perf_event *event,
2214                            struct perf_cpu_context *cpuctx,
2215                            int can_add_hw)
2216 {
2217         /*
2218          * Groups consisting entirely of software events can always go on.
2219          */
2220         if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2221                 return 1;
2222         /*
2223          * If an exclusive group is already on, no other hardware
2224          * events can go on.
2225          */
2226         if (cpuctx->exclusive)
2227                 return 0;
2228         /*
2229          * If this group is exclusive and there are already
2230          * events on the CPU, it can't go on.
2231          */
2232         if (event->attr.exclusive && cpuctx->active_oncpu)
2233                 return 0;
2234         /*
2235          * Otherwise, try to add it if all previous groups were able
2236          * to go on.
2237          */
2238         return can_add_hw;
2239 }
2240
2241 /*
2242  * Complement to update_event_times(). This computes the tstamp_* values to
2243  * continue 'enabled' state from @now, and effectively discards the time
2244  * between the prior tstamp_stopped and now (as we were in the OFF state, or
2245  * just switched (context) time base).
2246  *
2247  * This further assumes '@event->state == INACTIVE' (we just came from OFF), so
2248  * it cannot have been scheduled in yet. And going into INACTIVE state means
2249  * '@event->tstamp_stopped = @now'.
2250  *
2251  * Thus given the rules of update_event_times():
2252  *
2253  *   total_time_enabled = tstamp_stopped - tstamp_enabled
2254  *   total_time_running = tstamp_stopped - tstamp_running
2255  *
2256  * We can insert 'tstamp_stopped == now' and reverse them to compute new
2257  * tstamp_* values.
2258  */
2259 static void __perf_event_enable_time(struct perf_event *event, u64 now)
2260 {
2261         WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
2262
2263         event->tstamp_stopped = now;
2264         event->tstamp_enabled = now - event->total_time_enabled;
2265         event->tstamp_running = now - event->total_time_running;
2266 }
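/*
 * Worked example (editor's illustration of the rules above): suppose an event
 * has total_time_enabled = 30 and total_time_running = 10 when it is enabled
 * again at now = 100. Then tstamp_stopped = 100, tstamp_enabled = 70 and
 * tstamp_running = 90: the event still reports 30/10, the time spent in OFF
 * is discarded, enabled time resumes accruing from 100, and running time
 * stays at 10 until the event is scheduled back in.
 */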
2267
2268 static void add_event_to_ctx(struct perf_event *event,
2269                                struct perf_event_context *ctx)
2270 {
2271         u64 tstamp = perf_event_time(event);
2272
2273         list_add_event(event, ctx);
2274         perf_group_attach(event);
2275         /*
2276          * We can be called with event->state == STATE_OFF when we create with
2277          * .disabled = 1. In that case the IOC_ENABLE will call this function.
2278          */
2279         if (event->state == PERF_EVENT_STATE_INACTIVE)
2280                 __perf_event_enable_time(event, tstamp);
2281 }
2282
2283 static void ctx_sched_out(struct perf_event_context *ctx,
2284                           struct perf_cpu_context *cpuctx,
2285                           enum event_type_t event_type);
2286 static void
2287 ctx_sched_in(struct perf_event_context *ctx,
2288              struct perf_cpu_context *cpuctx,
2289              enum event_type_t event_type,
2290              struct task_struct *task);
2291
2292 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2293                                struct perf_event_context *ctx,
2294                                enum event_type_t event_type)
2295 {
2296         if (!cpuctx->task_ctx)
2297                 return;
2298
2299         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2300                 return;
2301
2302         ctx_sched_out(ctx, cpuctx, event_type);
2303 }
2304
2305 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2306                                 struct perf_event_context *ctx,
2307                                 struct task_struct *task)
2308 {
2309         cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2310         if (ctx)
2311                 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2312         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2313         if (ctx)
2314                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2315 }
2316
2317 /*
2318  * We want to maintain the following priority of scheduling:
2319  *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
2320  *  - task pinned (EVENT_PINNED)
2321  *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2322  *  - task flexible (EVENT_FLEXIBLE).
2323  *
2324  * In order to avoid unscheduling and scheduling back in everything every
2325  * time an event is added, only do it for the groups of equal priority and
2326  * below.
2327  *
2328  * This can be called after a batch operation on task events, in which case
2329  * event_type is a bit mask of the types of events involved. For CPU events,
2330  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2331  */
2332 static void ctx_resched(struct perf_cpu_context *cpuctx,
2333                         struct perf_event_context *task_ctx,
2334                         enum event_type_t event_type)
2335 {
2336         enum event_type_t ctx_event_type;
2337         bool cpu_event = !!(event_type & EVENT_CPU);
2338
2339         /*
2340          * If pinned groups are involved, flexible groups also need to be
2341          * scheduled out.
2342          */
2343         if (event_type & EVENT_PINNED)
2344                 event_type |= EVENT_FLEXIBLE;
2345
2346         ctx_event_type = event_type & EVENT_ALL;
2347
2348         perf_pmu_disable(cpuctx->ctx.pmu);
2349         if (task_ctx)
2350                 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2351
2352         /*
2353          * Decide which cpu ctx groups to schedule out based on the types
2354          * of events that caused rescheduling:
2355          *  - EVENT_CPU: schedule out corresponding groups;
2356          *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2357          *  - otherwise, do nothing more.
2358          */
2359         if (cpu_event)
2360                 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2361         else if (ctx_event_type & EVENT_PINNED)
2362                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2363
2364         perf_event_sched_in(cpuctx, task_ctx, current);
2365         perf_pmu_enable(cpuctx->ctx.pmu);
2366 }
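/*
 * Editor's note (illustrative, following the priority rules above): adding a
 * task pinned event sets EVENT_PINNED, which forces EVENT_FLEXIBLE too, so
 * the task's groups and the CPU's flexible groups are rescheduled while CPU
 * pinned groups stay in place; adding only a task flexible event reschedules
 * nothing of higher priority and the cpu context is left untouched.
 */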
2367
2368 /*
2369  * Cross CPU call to install and enable a performance event
2370  *
2371  * Very similar to remote_function() + event_function() but cannot assume that
2372  * things like ctx->is_active and cpuctx->task_ctx are set.
2373  */
2374 static int  __perf_install_in_context(void *info)
2375 {
2376         struct perf_event *event = info;
2377         struct perf_event_context *ctx = event->ctx;
2378         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2379         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2380         bool reprogram = true;
2381         int ret = 0;
2382
2383         raw_spin_lock(&cpuctx->ctx.lock);
2384         if (ctx->task) {
2385                 raw_spin_lock(&ctx->lock);
2386                 task_ctx = ctx;
2387
2388                 reprogram = (ctx->task == current);
2389
2390                 /*
2391                  * If the task is running, it must be running on this CPU,
2392                  * otherwise we cannot reprogram things.
2393                  *
2394                  * If it's not running, we don't care; ctx->lock will
2395                  * serialize against it becoming runnable.
2396                  */
2397                 if (task_curr(ctx->task) && !reprogram) {
2398                         ret = -ESRCH;
2399                         goto unlock;
2400                 }
2401
2402                 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2403         } else if (task_ctx) {
2404                 raw_spin_lock(&task_ctx->lock);
2405         }
2406
2407         if (reprogram) {
2408                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2409                 add_event_to_ctx(event, ctx);
2410                 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2411         } else {
2412                 add_event_to_ctx(event, ctx);
2413         }
2414
2415 unlock:
2416         perf_ctx_unlock(cpuctx, task_ctx);
2417
2418         return ret;
2419 }
2420
2421 /*
2422  * Attach a performance event to a context.
2423  *
2424  * Very similar to event_function_call, see comment there.
2425  */
2426 static void
2427 perf_install_in_context(struct perf_event_context *ctx,
2428                         struct perf_event *event,
2429                         int cpu)
2430 {
2431         struct task_struct *task = READ_ONCE(ctx->task);
2432
2433         lockdep_assert_held(&ctx->mutex);
2434
2435         if (event->cpu != -1)
2436                 event->cpu = cpu;
2437
2438         /*
2439          * Ensures that if we can observe event->ctx, both the event and ctx
2440          * will be 'complete'. See perf_iterate_sb_cpu().
2441          */
2442         smp_store_release(&event->ctx, ctx);
2443
2444         if (!task) {
2445                 cpu_function_call(cpu, __perf_install_in_context, event);
2446                 return;
2447         }
2448
2449         /*
2450          * Should not happen; we validate that the ctx is still alive before calling.
2451          */
2452         if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2453                 return;
2454
2455         /*
2456          * Installing events is tricky because we cannot rely on ctx->is_active
2457          * to be set in case this is the nr_events 0 -> 1 transition.
2458          *
2459          * Instead we use task_curr(), which tells us if the task is running.
2460          * However, since we use task_curr() outside of rq::lock, we can race
2461          * against the actual state. This means the result can be wrong.
2462          *
2463          * If we get a false positive, we retry, this is harmless.
2464          *
2465          * If we get a false negative, things are complicated. If we are after
2466          * perf_event_context_sched_in() ctx::lock will serialize us, and the
2467          * value must be correct. If we're before, it doesn't matter since
2468          * perf_event_context_sched_in() will program the counter.
2469          *
2470          * However, this hinges on the remote context switch having observed
2471          * our task->perf_event_ctxp[] store, such that it will in fact take
2472          * ctx::lock in perf_event_context_sched_in().
2473          *
2474          * We do this by task_function_call(), if the IPI fails to hit the task
2475          * We do this by task_function_call(): if the IPI fails to hit the task,
2476          * we know any future context switch of the task must see the
2477          * perf_event_ctxp[] store.
2478
2479         /*
2480          * This smp_mb() orders the task->perf_event_ctxp[] store with the
2481          * task_cpu() load, such that if the IPI then does not find the task
2482          * running, a future context switch of that task must observe the
2483          * store.
2484          */
2485         smp_mb();
2486 again:
2487         if (!task_function_call(task, __perf_install_in_context, event))
2488                 return;
2489
2490         raw_spin_lock_irq(&ctx->lock);
2491         task = ctx->task;
2492         if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2493                 /*
2494                  * Cannot happen because we already checked above (which also
2495                  * cannot happen), and we hold ctx->mutex, which serializes us
2496                  * against perf_event_exit_task_context().
2497                  */
2498                 raw_spin_unlock_irq(&ctx->lock);
2499                 return;
2500         }
2501         /*
2502          * If the task is not running, ctx->lock will avoid it becoming so,
2503          * thus we can safely install the event.
2504          */
2505         if (task_curr(task)) {
2506                 raw_spin_unlock_irq(&ctx->lock);
2507                 goto again;
2508         }
2509         add_event_to_ctx(event, ctx);
2510         raw_spin_unlock_irq(&ctx->lock);
2511 }
2512
2513 /*
2514  * Put an event into inactive state and update time fields.
2515  * Enabling the leader of a group effectively enables all
2516  * the group members that aren't explicitly disabled, so we
2517  * have to update their ->tstamp_enabled also.
2518  * Note: this works for group members as well as group leaders
2519  * since the non-leader members' sibling_lists will be empty.
2520  */
2521 static void __perf_event_mark_enabled(struct perf_event *event)
2522 {
2523         struct perf_event *sub;
2524         u64 tstamp = perf_event_time(event);
2525
2526         event->state = PERF_EVENT_STATE_INACTIVE;
2527         __perf_event_enable_time(event, tstamp);
2528         list_for_each_entry(sub, &event->sibling_list, group_entry) {
2529                 /* XXX should not be > INACTIVE if event isn't */
2530                 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2531                         __perf_event_enable_time(sub, tstamp);
2532         }
2533 }
2534
2535 /*
2536  * Cross CPU call to enable a performance event
2537  */
2538 static void __perf_event_enable(struct perf_event *event,
2539                                 struct perf_cpu_context *cpuctx,
2540                                 struct perf_event_context *ctx,
2541                                 void *info)
2542 {
2543         struct perf_event *leader = event->group_leader;
2544         struct perf_event_context *task_ctx;
2545
2546         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2547             event->state <= PERF_EVENT_STATE_ERROR)
2548                 return;
2549
2550         if (ctx->is_active)
2551                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2552
2553         __perf_event_mark_enabled(event);
2554
2555         if (!ctx->is_active)
2556                 return;
2557
2558         if (!event_filter_match(event)) {
2559                 if (is_cgroup_event(event))
2560                         perf_cgroup_defer_enabled(event);
2561                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2562                 return;
2563         }
2564
2565         /*
2566          * If the event is in a group and isn't the group leader,
2567          * then don't put it on unless the group is on.
2568          */
2569         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2570                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2571                 return;
2572         }
2573
2574         task_ctx = cpuctx->task_ctx;
2575         if (ctx->task)
2576                 WARN_ON_ONCE(task_ctx != ctx);
2577
2578         ctx_resched(cpuctx, task_ctx, get_event_type(event));
2579 }
2580
2581 /*
2582  * Enable an event.
2583  *
2584  * If event->ctx is a cloned context, callers must make sure that
2585  * every task struct that event->ctx->task could possibly point to
2586  * remains valid.  This condition is satisfied when called through
2587  * perf_event_for_each_child or perf_event_for_each as described
2588  * for perf_event_disable.
2589  */
2590 static void _perf_event_enable(struct perf_event *event)
2591 {
2592         struct perf_event_context *ctx = event->ctx;
2593
2594         raw_spin_lock_irq(&ctx->lock);
2595         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2596             event->state <  PERF_EVENT_STATE_ERROR) {
2597                 raw_spin_unlock_irq(&ctx->lock);
2598                 return;
2599         }
2600
2601         /*
2602          * If the event is in error state, clear that first.
2603          *
2604          * That way, if we see the event in error state below, we know that it
2605          * has gone back into error state, as distinct from the task having
2606          * been scheduled away before the cross-call arrived.
2607          */
2608         if (event->state == PERF_EVENT_STATE_ERROR)
2609                 event->state = PERF_EVENT_STATE_OFF;
2610         raw_spin_unlock_irq(&ctx->lock);
2611
2612         event_function_call(event, __perf_event_enable, NULL);
2613 }
2614
2615 /*
2616  * See perf_event_disable();
2617  */
2618 void perf_event_enable(struct perf_event *event)
2619 {
2620         struct perf_event_context *ctx;
2621
2622         ctx = perf_event_ctx_lock(event);
2623         _perf_event_enable(event);
2624         perf_event_ctx_unlock(event, ctx);
2625 }
2626 EXPORT_SYMBOL_GPL(perf_event_enable);
2627
2628 struct stop_event_data {
2629         struct perf_event       *event;
2630         unsigned int            restart;
2631 };
2632
2633 static int __perf_event_stop(void *info)
2634 {
2635         struct stop_event_data *sd = info;
2636         struct perf_event *event = sd->event;
2637
2638         /* if it's already INACTIVE, do nothing */
2639         if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2640                 return 0;
2641
2642         /* matches smp_wmb() in event_sched_in() */
2643         smp_rmb();
2644
2645         /*
2646          * There is a window with interrupts enabled before we get here,
2647          * so we need to check again lest we try to stop another CPU's event.
2648          */
2649         if (READ_ONCE(event->oncpu) != smp_processor_id())
2650                 return -EAGAIN;
2651
2652         event->pmu->stop(event, PERF_EF_UPDATE);
2653
2654         /*
2655          * May race with the actual stop (through perf_pmu_output_stop()),
2656          * but it is only used for events with AUX ring buffer, and such
2657          * events will refuse to restart because of rb::aux_mmap_count==0,
2658          * see comments in perf_aux_output_begin().
2659          *
2660          * Since this is happening on an event-local CPU, no trace is lost
2661          * while restarting.
2662          */
2663         if (sd->restart)
2664                 event->pmu->start(event, 0);
2665
2666         return 0;
2667 }
2668
2669 static int perf_event_stop(struct perf_event *event, int restart)
2670 {
2671         struct stop_event_data sd = {
2672                 .event          = event,
2673                 .restart        = restart,
2674         };
2675         int ret = 0;
2676
2677         do {
2678                 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2679                         return 0;
2680
2681                 /* matches smp_wmb() in event_sched_in() */
2682                 smp_rmb();
2683
2684                 /*
2685                  * We only want to restart ACTIVE events, so if the event goes
2686                  * inactive here (event->oncpu==-1), there's nothing more to do;
2687                  * fall through with ret==-ENXIO.
2688                  */
2689                 ret = cpu_function_call(READ_ONCE(event->oncpu),
2690                                         __perf_event_stop, &sd);
2691         } while (ret == -EAGAIN);
2692
2693         return ret;
2694 }
2695
2696 /*
2697  * In order to contain the amount of raciness and trickiness in the address
2698  * filter configuration management, it is a two-part process:
2699  *
2700  * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
2701  *      we update the addresses of corresponding vmas in
2702  *      event::addr_filters_offs array and bump the event::addr_filters_gen;
2703  * (p2) when an event is scheduled in (pmu::add), it calls
2704  *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
2705  *      if the generation has changed since the previous call.
2706  *
2707  * If (p1) happens while the event is active, we restart it to force (p2).
2708  *
2709  * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
2710  *     pre-existing mappings, called once when new filters arrive via SET_FILTER
2711  *     ioctl;
2712  * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
2713  *     registered mapping, called for every new mmap(), with mm::mmap_sem down
2714  *     for reading;
2715  * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
2716  *     of exec.
2717  */
2718 void perf_event_addr_filters_sync(struct perf_event *event)
2719 {
2720         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
2721
2722         if (!has_addr_filter(event))
2723                 return;
2724
2725         raw_spin_lock(&ifh->lock);
2726         if (event->addr_filters_gen != event->hw.addr_filters_gen) {
2727                 event->pmu->addr_filters_sync(event);
2728                 event->hw.addr_filters_gen = event->addr_filters_gen;
2729         }
2730         raw_spin_unlock(&ifh->lock);
2731 }
2732 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
2733
2734 static int _perf_event_refresh(struct perf_event *event, int refresh)
2735 {
2736         /*
2737          * not supported on inherited events
2738          */
2739         if (event->attr.inherit || !is_sampling_event(event))
2740                 return -EINVAL;
2741
2742         atomic_add(refresh, &event->event_limit);
2743         _perf_event_enable(event);
2744
2745         return 0;
2746 }
2747
2748 /*
2749  * See perf_event_disable()
2750  */
2751 int perf_event_refresh(struct perf_event *event, int refresh)
2752 {
2753         struct perf_event_context *ctx;
2754         int ret;
2755
2756         ctx = perf_event_ctx_lock(event);
2757         ret = _perf_event_refresh(event, refresh);
2758         perf_event_ctx_unlock(event, ctx);
2759
2760         return ret;
2761 }
2762 EXPORT_SYMBOL_GPL(perf_event_refresh);
2763
2764 static void ctx_sched_out(struct perf_event_context *ctx,
2765                           struct perf_cpu_context *cpuctx,
2766                           enum event_type_t event_type)
2767 {
2768         int is_active = ctx->is_active;
2769         struct perf_event *event;
2770
2771         lockdep_assert_held(&ctx->lock);
2772
2773         if (likely(!ctx->nr_events)) {
2774                 /*
2775                  * See __perf_remove_from_context().
2776                  */
2777                 WARN_ON_ONCE(ctx->is_active);
2778                 if (ctx->task)
2779                         WARN_ON_ONCE(cpuctx->task_ctx);
2780                 return;
2781         }
2782
2783         ctx->is_active &= ~event_type;
2784         if (!(ctx->is_active & EVENT_ALL))
2785                 ctx->is_active = 0;
2786
2787         if (ctx->task) {
2788                 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2789                 if (!ctx->is_active)
2790                         cpuctx->task_ctx = NULL;
2791         }
2792
2793         /*
2794          * Always update time if it was set; not only when it changes.
2795          * Otherwise we can 'forget' to update time for any but the last
2796          * context we sched out. For example:
2797          *
2798          *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
2799          *   ctx_sched_out(.event_type = EVENT_PINNED)
2800          *
2801          * would only update time for the pinned events.
2802          */
2803         if (is_active & EVENT_TIME) {
2804                 /* update (and stop) ctx time */
2805                 update_context_time(ctx);
2806                 update_cgrp_time_from_cpuctx(cpuctx);
2807         }
2808
2809         is_active ^= ctx->is_active; /* changed bits */
2810
2811         if (!ctx->nr_active || !(is_active & EVENT_ALL))
2812                 return;
2813
2814         perf_pmu_disable(ctx->pmu);
2815         if (is_active & EVENT_PINNED) {
2816                 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2817                         group_sched_out(event, cpuctx, ctx);
2818         }
2819
2820         if (is_active & EVENT_FLEXIBLE) {
2821                 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2822                         group_sched_out(event, cpuctx, ctx);
2823         }
2824         perf_pmu_enable(ctx->pmu);
2825 }
2826
2827 /*
2828  * Test whether two contexts are equivalent, i.e. whether they have both been
2829  * cloned from the same version of the same context.
2830  *
2831  * Equivalence is measured using a generation number in the context that is
2832  * incremented on each modification to it; see unclone_ctx(), list_add_event()
2833  * and list_del_event().
2834  */
2835 static int context_equiv(struct perf_event_context *ctx1,
2836                          struct perf_event_context *ctx2)
2837 {
2838         lockdep_assert_held(&ctx1->lock);
2839         lockdep_assert_held(&ctx2->lock);
2840
2841         /* Pinning disables the swap optimization */
2842         if (ctx1->pin_count || ctx2->pin_count)
2843                 return 0;
2844
2845         /* If ctx1 is the parent of ctx2 */
2846         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2847                 return 1;
2848
2849         /* If ctx2 is the parent of ctx1 */
2850         if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2851                 return 1;
2852
2853         /*
2854          * If ctx1 and ctx2 have the same parent; we flatten the parent
2855          * hierarchy, see perf_event_init_context().
2856          */
2857         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2858                         ctx1->parent_gen == ctx2->parent_gen)
2859                 return 1;
2860
2861         /* Unmatched */
2862         return 0;
2863 }
2864
2865 static void __perf_event_sync_stat(struct perf_event *event,
2866                                      struct perf_event *next_event)
2867 {
2868         u64 value;
2869
2870         if (!event->attr.inherit_stat)
2871                 return;
2872
2873         /*
2874          * Update the event value. We cannot use perf_event_read()
2875          * because we're in the middle of a context switch and have IRQs
2876          * disabled, which upsets smp_call_function_single(); however,
2877          * we know the event must be on the current CPU, therefore we
2878          * don't need to use it.
2879          */
2880         switch (event->state) {
2881         case PERF_EVENT_STATE_ACTIVE:
2882                 event->pmu->read(event);
2883                 /* fall-through */
2884
2885         case PERF_EVENT_STATE_INACTIVE:
2886                 update_event_times(event);
2887                 break;
2888
2889         default:
2890                 break;
2891         }
2892
2893         /*
2894          * In order to keep per-task stats reliable we need to flip the event
2895          * values when we flip the contexts.
2896          */
2897         value = local64_read(&next_event->count);
2898         value = local64_xchg(&event->count, value);
2899         local64_set(&next_event->count, value);
2900
2901         swap(event->total_time_enabled, next_event->total_time_enabled);
2902         swap(event->total_time_running, next_event->total_time_running);
2903
2904         /*
2905          * Since we swizzled the values, update the user visible data too.
2906          */
2907         perf_event_update_userpage(event);
2908         perf_event_update_userpage(next_event);
2909 }
2910
2911 static void perf_event_sync_stat(struct perf_event_context *ctx,
2912                                    struct perf_event_context *next_ctx)
2913 {
2914         struct perf_event *event, *next_event;
2915
2916         if (!ctx->nr_stat)
2917                 return;
2918
2919         update_context_time(ctx);
2920
2921         event = list_first_entry(&ctx->event_list,
2922                                    struct perf_event, event_entry);
2923
2924         next_event = list_first_entry(&next_ctx->event_list,
2925                                         struct perf_event, event_entry);
2926
2927         while (&event->event_entry != &ctx->event_list &&
2928                &next_event->event_entry != &next_ctx->event_list) {
2929
2930                 __perf_event_sync_stat(event, next_event);
2931
2932                 event = list_next_entry(event, event_entry);
2933                 next_event = list_next_entry(next_event, event_entry);
2934         }
2935 }
2936
2937 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2938                                          struct task_struct *next)
2939 {
2940         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2941         struct perf_event_context *next_ctx;
2942         struct perf_event_context *parent, *next_parent;
2943         struct perf_cpu_context *cpuctx;
2944         int do_switch = 1;
2945
2946         if (likely(!ctx))
2947                 return;
2948
2949         cpuctx = __get_cpu_context(ctx);
2950         if (!cpuctx->task_ctx)
2951                 return;
2952
2953         rcu_read_lock();
2954         next_ctx = next->perf_event_ctxp[ctxn];
2955         if (!next_ctx)
2956                 goto unlock;
2957
2958         parent = rcu_dereference(ctx->parent_ctx);
2959         next_parent = rcu_dereference(next_ctx->parent_ctx);
2960
2961         /* If neither context has a parent context, they cannot be clones. */
2962         if (!parent && !next_parent)
2963                 goto unlock;
2964
2965         if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2966                 /*
2967                  * Looks like the two contexts are clones, so we might be
2968                  * able to optimize the context switch.  We lock both
2969                  * contexts and check that they are clones under the
2970                  * lock (including re-checking that neither has been
2971                  * uncloned in the meantime).  It doesn't matter which
2972                  * order we take the locks because no other cpu could
2973                  * be trying to lock both of these tasks.
2974                  */
2975                 raw_spin_lock(&ctx->lock);
2976                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2977                 if (context_equiv(ctx, next_ctx)) {
2978                         WRITE_ONCE(ctx->task, next);
2979                         WRITE_ONCE(next_ctx->task, task);
2980
2981                         swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2982
2983                         /*
2984                          * RCU_INIT_POINTER here is safe because we've not
2985                          * modified the ctx and the above modification of
2986                          * ctx->task and ctx->task_ctx_data are immaterial
2987                          * since those values are always verified under
2988                          * ctx->lock which we're now holding.
2989                          */
2990                         RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
2991                         RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
2992
2993                         do_switch = 0;
2994
2995                         perf_event_sync_stat(ctx, next_ctx);
2996                 }
2997                 raw_spin_unlock(&next_ctx->lock);
2998                 raw_spin_unlock(&ctx->lock);
2999         }
3000 unlock:
3001         rcu_read_unlock();
3002
3003         if (do_switch) {
3004                 raw_spin_lock(&ctx->lock);
3005                 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3006                 raw_spin_unlock(&ctx->lock);
3007         }
3008 }
3009
3010 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3011
3012 void perf_sched_cb_dec(struct pmu *pmu)
3013 {
3014         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3015
3016         this_cpu_dec(perf_sched_cb_usages);
3017
3018         if (!--cpuctx->sched_cb_usage)
3019                 list_del(&cpuctx->sched_cb_entry);
3020 }
3021
3022
3023 void perf_sched_cb_inc(struct pmu *pmu)
3024 {
3025         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3026
3027         if (!cpuctx->sched_cb_usage++)
3028                 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3029
3030         this_cpu_inc(perf_sched_cb_usages);
3031 }
3032
3033 /*
3034  * This function provides the context switch callback to the lower code
3035  * layer. It is invoked ONLY when the context switch callback is enabled.
3036  *
3037  * This callback is relevant even to per-cpu events; for example multi event
3038  * PEBS requires this to provide PID/TID information. This requires we flush
3039  * all queued PEBS records before we context switch to a new task.
3040  */
3041 static void perf_pmu_sched_task(struct task_struct *prev,
3042                                 struct task_struct *next,
3043                                 bool sched_in)
3044 {
3045         struct perf_cpu_context *cpuctx;
3046         struct pmu *pmu;
3047
3048         if (prev == next)
3049                 return;
3050
3051         list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3052                 pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3053
3054                 if (WARN_ON_ONCE(!pmu->sched_task))
3055                         continue;
3056
3057                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3058                 perf_pmu_disable(pmu);
3059
3060                 pmu->sched_task(cpuctx->task_ctx, sched_in);
3061
3062                 perf_pmu_enable(pmu);
3063                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3064         }
3065 }
3066
3067 static void perf_event_switch(struct task_struct *task,
3068                               struct task_struct *next_prev, bool sched_in);
3069
3070 #define for_each_task_context_nr(ctxn)                                  \
3071         for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3072
3073 /*
3074  * Called from scheduler to remove the events of the current task,
3075  * with interrupts disabled.
3076  *
3077  * We stop each event and update the event value in event->count.
3078  *
3079  * This does not protect us against NMI, but disable()
3080  * sets the disabled bit in the control field of event _before_
3081  * accessing the event control register. If an NMI hits, then it will
3082  * not restart the event.
3083  */
3084 void __perf_event_task_sched_out(struct task_struct *task,
3085                                  struct task_struct *next)
3086 {
3087         int ctxn;
3088
3089         if (__this_cpu_read(perf_sched_cb_usages))
3090                 perf_pmu_sched_task(task, next, false);
3091
3092         if (atomic_read(&nr_switch_events))
3093                 perf_event_switch(task, next, false);
3094
3095         for_each_task_context_nr(ctxn)
3096                 perf_event_context_sched_out(task, ctxn, next);
3097
3098         /*
3099          * If cgroup events exist on this CPU, then we need
3100          * to check if we have to switch out PMU state.
3101          * cgroup events are system-wide mode only.
3102          */
3103         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3104                 perf_cgroup_sched_out(task, next);
3105 }
3106
3107 /*
3108  * Called with IRQs disabled
3109  */
3110 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3111                               enum event_type_t event_type)
3112 {
3113         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3114 }
3115
3116 static void
3117 ctx_pinned_sched_in(struct perf_event_context *ctx,
3118                     struct perf_cpu_context *cpuctx)
3119 {
3120         struct perf_event *event;
3121
3122         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
3123                 if (event->state <= PERF_EVENT_STATE_OFF)
3124                         continue;
3125                 if (!event_filter_match(event))
3126                         continue;
3127
3128                 /* may need to reset tstamp_enabled */
3129                 if (is_cgroup_event(event))
3130                         perf_cgroup_mark_enabled(event, ctx);
3131
3132                 if (group_can_go_on(event, cpuctx, 1))
3133                         group_sched_in(event, cpuctx, ctx);
3134
3135                 /*
3136                  * If this pinned group hasn't been scheduled,
3137                  * put it in error state.
3138                  */
3139                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
3140                         update_group_times(event);
3141                         event->state = PERF_EVENT_STATE_ERROR;
3142                 }
3143         }
3144 }
3145
3146 static void
3147 ctx_flexible_sched_in(struct perf_event_context *ctx,
3148                       struct perf_cpu_context *cpuctx)
3149 {
3150         struct perf_event *event;
3151         int can_add_hw = 1;
3152
3153         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
3154                 /* Ignore events in OFF or ERROR state */
3155                 if (event->state <= PERF_EVENT_STATE_OFF)
3156                         continue;
3157                 /*
3158                  * Listen to the 'cpu' scheduling filter constraint
3159                  * of events:
3160                  */
3161                 if (!event_filter_match(event))
3162                         continue;
3163
3164                 /* may need to reset tstamp_enabled */
3165                 if (is_cgroup_event(event))
3166                         perf_cgroup_mark_enabled(event, ctx);
3167
3168                 if (group_can_go_on(event, cpuctx, can_add_hw)) {
3169                         if (group_sched_in(event, cpuctx, ctx))
3170                                 can_add_hw = 0;
3171                 }
3172         }
3173 }
3174
3175 static void
3176 ctx_sched_in(struct perf_event_context *ctx,
3177              struct perf_cpu_context *cpuctx,
3178              enum event_type_t event_type,
3179              struct task_struct *task)
3180 {
3181         int is_active = ctx->is_active;
3182         u64 now;
3183
3184         lockdep_assert_held(&ctx->lock);
3185
3186         if (likely(!ctx->nr_events))
3187                 return;
3188
3189         ctx->is_active |= (event_type | EVENT_TIME);
3190         if (ctx->task) {
3191                 if (!is_active)
3192                         cpuctx->task_ctx = ctx;
3193                 else
3194                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3195         }
3196
3197         is_active ^= ctx->is_active; /* changed bits */
3198
3199         if (is_active & EVENT_TIME) {
3200                 /* start ctx time */
3201                 now = perf_clock();
3202                 ctx->timestamp = now;
3203                 perf_cgroup_set_timestamp(task, ctx);
3204         }
3205
3206         /*
3207          * First go through the list and put on any pinned groups
3208          * in order to give them the best chance of going on.
3209          */
3210         if (is_active & EVENT_PINNED)
3211                 ctx_pinned_sched_in(ctx, cpuctx);
3212
3213         /* Then walk through the lower prio flexible groups */
3214         if (is_active & EVENT_FLEXIBLE)
3215                 ctx_flexible_sched_in(ctx, cpuctx);
3216 }
3217
3218 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3219                              enum event_type_t event_type,
3220                              struct task_struct *task)
3221 {
3222         struct perf_event_context *ctx = &cpuctx->ctx;
3223
3224         ctx_sched_in(ctx, cpuctx, event_type, task);
3225 }
3226
3227 static void perf_event_context_sched_in(struct perf_event_context *ctx,
3228                                         struct task_struct *task)
3229 {
3230         struct perf_cpu_context *cpuctx;
3231
3232         cpuctx = __get_cpu_context(ctx);
3233         if (cpuctx->task_ctx == ctx)
3234                 return;
3235
3236         perf_ctx_lock(cpuctx, ctx);
3237         /*
3238          * We must check ctx->nr_events while holding ctx->lock, such
3239          * that we serialize against perf_install_in_context().
3240          */
3241         if (!ctx->nr_events)
3242                 goto unlock;
3243
3244         perf_pmu_disable(ctx->pmu);
3245         /*
3246          * We want to keep the following priority order:
3247          * cpu pinned (that don't need to move), task pinned,
3248          * cpu flexible, task flexible.
3249          *
3250          * However, if the task's ctx is not carrying any pinned
3251          * events, there is no need to flip the cpuctx's events around.
3252          */
3253         if (!list_empty(&ctx->pinned_groups))
3254                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3255         perf_event_sched_in(cpuctx, ctx, task);
3256         perf_pmu_enable(ctx->pmu);
3257
3258 unlock:
3259         perf_ctx_unlock(cpuctx, ctx);
3260 }
3261
3262 /*
3263  * Called from scheduler to add the events of the current task
3264  * with interrupts disabled.
3265  *
3266  * We restore the event value and then enable it.
3267  *
3268  * This does not protect us against NMI, but enable()
3269  * sets the enabled bit in the control field of the event _before_
3270  * accessing the event control register. If an NMI hits, then it will
3271  * keep the event running.
3272  */
3273 void __perf_event_task_sched_in(struct task_struct *prev,
3274                                 struct task_struct *task)
3275 {
3276         struct perf_event_context *ctx;
3277         int ctxn;
3278
3279         /*
3280          * If cgroup events exist on this CPU, then we need to check if we have
3281          * to switch in PMU state; cgroup events are system-wide mode only.
3282          *
3283          * Since cgroup events are CPU events, we must schedule these in before
3284          * we schedule in the task events.
3285          */
3286         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3287                 perf_cgroup_sched_in(prev, task);
3288
3289         for_each_task_context_nr(ctxn) {
3290                 ctx = task->perf_event_ctxp[ctxn];
3291                 if (likely(!ctx))
3292                         continue;
3293
3294                 perf_event_context_sched_in(ctx, task);
3295         }
3296
3297         if (atomic_read(&nr_switch_events))
3298                 perf_event_switch(task, prev, true);
3299
3300         if (__this_cpu_read(perf_sched_cb_usages))
3301                 perf_pmu_sched_task(prev, task, true);
3302 }
3303
3304 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3305 {
3306         u64 frequency = event->attr.sample_freq;
3307         u64 sec = NSEC_PER_SEC;
3308         u64 divisor, dividend;
3309
3310         int count_fls, nsec_fls, frequency_fls, sec_fls;
3311
3312         count_fls = fls64(count);
3313         nsec_fls = fls64(nsec);
3314         frequency_fls = fls64(frequency);
3315         sec_fls = 30;
3316
3317         /*
3318          * We got @count in @nsec, with a target of sample_freq HZ;
3319          * the target period becomes:
3320          *
3321          *             @count * 10^9
3322          * period = -------------------
3323          *          @nsec * sample_freq
3324          *
3325          */
3326
3327         /*
3328          * Reduce accuracy by one bit such that @a and @b converge
3329          * to a similar magnitude.
3330          */
3331 #define REDUCE_FLS(a, b)                \
3332 do {                                    \
3333         if (a##_fls > b##_fls) {        \
3334                 a >>= 1;                \
3335                 a##_fls--;              \
3336         } else {                        \
3337                 b >>= 1;                \
3338                 b##_fls--;              \
3339         }                               \
3340 } while (0)
3341
3342         /*
3343          * Reduce accuracy until either term fits in a u64, then proceed with
3344          * the other, so that finally we can do a u64/u64 division.
3345          */
3346         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3347                 REDUCE_FLS(nsec, frequency);
3348                 REDUCE_FLS(sec, count);
3349         }
3350
3351         if (count_fls + sec_fls > 64) {
3352                 divisor = nsec * frequency;
3353
3354                 while (count_fls + sec_fls > 64) {
3355                         REDUCE_FLS(count, sec);
3356                         divisor >>= 1;
3357                 }
3358
3359                 dividend = count * sec;
3360         } else {
3361                 dividend = count * sec;
3362
3363                 while (nsec_fls + frequency_fls > 64) {
3364                         REDUCE_FLS(nsec, frequency);
3365                         dividend >>= 1;
3366                 }
3367
3368                 divisor = nsec * frequency;
3369         }
3370
3371         if (!divisor)
3372                 return dividend;
3373
3374         return div64_u64(dividend, divisor);
3375 }
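/*
 * Illustrative worked example (not from the original source) of the formula
 * above: if we counted @count = 1,000,000 events over @nsec = 4,000,000 ns
 * (4 ms) and sample_freq is 1000 Hz, then
 *
 *	period = (1,000,000 * 10^9) / (4,000,000 * 1000) = 250,000
 *
 * i.e. one sample every 250,000 events yields ~1000 samples/sec at the
 * observed event rate. The REDUCE_FLS() dance only drops low-order bits so
 * that both the dividend and the divisor fit in a u64 before the final
 * div64_u64().
 */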
3376
3377 static DEFINE_PER_CPU(int, perf_throttled_count);
3378 static DEFINE_PER_CPU(u64, perf_throttled_seq);
3379
3380 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3381 {
3382         struct hw_perf_event *hwc = &event->hw;
3383         s64 period, sample_period;
3384         s64 delta;
3385
3386         period = perf_calculate_period(event, nsec, count);
3387
3388         delta = (s64)(period - hwc->sample_period);
3389         delta = (delta + 7) / 8; /* low pass filter */
3390
3391         sample_period = hwc->sample_period + delta;
3392
3393         if (!sample_period)
3394                 sample_period = 1;
3395
3396         hwc->sample_period = sample_period;
3397
3398         if (local64_read(&hwc->period_left) > 8*sample_period) {
3399                 if (disable)
3400                         event->pmu->stop(event, PERF_EF_UPDATE);
3401
3402                 local64_set(&hwc->period_left, 0);
3403
3404                 if (disable)
3405                         event->pmu->start(event, PERF_EF_RELOAD);
3406         }
3407 }
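/*
 * Illustrative example (not from the original source) of the low-pass filter
 * above: with a current sample_period of 200,000 and a newly computed target
 * period of 250,000, delta = 50,000 and the filtered adjustment is
 * (50,000 + 7) / 8 = 6,250, giving a new sample_period of 206,250. Repeated
 * ticks converge on the target gradually instead of oscillating. If the
 * pending period_left exceeds 8 * sample_period, the event is (optionally)
 * stopped, period_left is cleared and the event restarted so the new period
 * takes effect promptly.
 */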
3408
3409 /*
3410  * Combine freq adjustment with unthrottling to avoid two passes over the
3411  * events. At the same time, make sure that having freq events does not
3412  * change the rate of unthrottling, as that would introduce bias.
3413  */
3414 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3415                                            int needs_unthr)
3416 {
3417         struct perf_event *event;
3418         struct hw_perf_event *hwc;
3419         u64 now, period = TICK_NSEC;
3420         s64 delta;
3421
3422         /*
3423          * We only need to iterate over all events if:
3424          * - the context has events in frequency mode (needs freq adjust)
3425          * - there are events to unthrottle on this CPU
3426          */
3427         if (!(ctx->nr_freq || needs_unthr))
3428                 return;
3429
3430         raw_spin_lock(&ctx->lock);
3431         perf_pmu_disable(ctx->pmu);
3432
3433         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3434                 if (event->state != PERF_EVENT_STATE_ACTIVE)
3435                         continue;
3436
3437                 if (!event_filter_match(event))
3438                         continue;
3439
3440                 perf_pmu_disable(event->pmu);
3441
3442                 hwc = &event->hw;
3443
3444                 if (hwc->interrupts == MAX_INTERRUPTS) {
3445                         hwc->interrupts = 0;
3446                         perf_log_throttle(event, 1);
3447                         event->pmu->start(event, 0);
3448                 }
3449
3450                 if (!event->attr.freq || !event->attr.sample_freq)
3451                         goto next;
3452
3453                 /*
3454                  * stop the event and update event->count
3455                  */
3456                 event->pmu->stop(event, PERF_EF_UPDATE);
3457
3458                 now = local64_read(&event->count);
3459                 delta = now - hwc->freq_count_stamp;
3460                 hwc->freq_count_stamp = now;
3461
3462                 /*
3463                  * Restart the event; reload only if the value has
3464                  * changed.
3465                  * We have already stopped the event, so tell
3466                  * perf_adjust_period() not to stop and restart it a
3467                  * second time.
3468                  */
3469                 if (delta > 0)
3470                         perf_adjust_period(event, period, delta, false);
3471
3472                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3473         next:
3474                 perf_pmu_enable(event->pmu);
3475         }
3476
3477         perf_pmu_enable(ctx->pmu);
3478         raw_spin_unlock(&ctx->lock);
3479 }
3480
3481 /*
3482  * Round-robin a context's events:
3483  */
3484 static void rotate_ctx(struct perf_event_context *ctx)
3485 {
3486         /*
3487          * Rotate the first entry of the non-pinned (flexible) groups to the
3488          * end of the list. Rotation might be disabled by the inheritance code.
3489          */
3490         if (!ctx->rotate_disable)
3491                 list_rotate_left(&ctx->flexible_groups);
3492 }
3493
3494 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
3495 {
3496         struct perf_event_context *ctx = NULL;
3497         int rotate = 0;
3498
3499         if (cpuctx->ctx.nr_events) {
3500                 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3501                         rotate = 1;
3502         }
3503
3504         ctx = cpuctx->task_ctx;
3505         if (ctx && ctx->nr_events) {
3506                 if (ctx->nr_events != ctx->nr_active)
3507                         rotate = 1;
3508         }
3509
3510         if (!rotate)
3511                 goto done;
3512
3513         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3514         perf_pmu_disable(cpuctx->ctx.pmu);
3515
3516         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3517         if (ctx)
3518                 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3519
3520         rotate_ctx(&cpuctx->ctx);
3521         if (ctx)
3522                 rotate_ctx(ctx);
3523
3524         perf_event_sched_in(cpuctx, ctx, current);
3525
3526         perf_pmu_enable(cpuctx->ctx.pmu);
3527         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3528 done:
3529
3530         return rotate;
3531 }
3532
3533 void perf_event_task_tick(void)
3534 {
3535         struct list_head *head = this_cpu_ptr(&active_ctx_list);
3536         struct perf_event_context *ctx, *tmp;
3537         int throttled;
3538
3539         WARN_ON(!irqs_disabled());
3540
3541         __this_cpu_inc(perf_throttled_seq);
3542         throttled = __this_cpu_xchg(perf_throttled_count, 0);
3543         tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
3544
3545         list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3546                 perf_adjust_freq_unthr_context(ctx, throttled);
3547 }
3548
3549 static int event_enable_on_exec(struct perf_event *event,
3550                                 struct perf_event_context *ctx)
3551 {
3552         if (!event->attr.enable_on_exec)
3553                 return 0;
3554
3555         event->attr.enable_on_exec = 0;
3556         if (event->state >= PERF_EVENT_STATE_INACTIVE)
3557                 return 0;
3558
3559         __perf_event_mark_enabled(event);
3560
3561         return 1;
3562 }
3563
3564 /*
3565  * Enable all of a task's events that have been marked enable-on-exec.
3566  * This expects task == current.
3567  */
3568 static void perf_event_enable_on_exec(int ctxn)
3569 {
3570         struct perf_event_context *ctx, *clone_ctx = NULL;
3571         enum event_type_t event_type = 0;
3572         struct perf_cpu_context *cpuctx;
3573         struct perf_event *event;
3574         unsigned long flags;
3575         int enabled = 0;
3576
3577         local_irq_save(flags);
3578         ctx = current->perf_event_ctxp[ctxn];
3579         if (!ctx || !ctx->nr_events)
3580                 goto out;
3581
3582         cpuctx = __get_cpu_context(ctx);
3583         perf_ctx_lock(cpuctx, ctx);
3584         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
3585         list_for_each_entry(event, &ctx->event_list, event_entry) {
3586                 enabled |= event_enable_on_exec(event, ctx);
3587                 event_type |= get_event_type(event);
3588         }
3589
3590         /*
3591          * Unclone and reschedule this context if we enabled any event.
3592          */
3593         if (enabled) {
3594                 clone_ctx = unclone_ctx(ctx);
3595                 ctx_resched(cpuctx, ctx, event_type);
3596         } else {
3597                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
3598         }
3599         perf_ctx_unlock(cpuctx, ctx);
3600
3601 out:
3602         local_irq_restore(flags);
3603
3604         if (clone_ctx)
3605                 put_ctx(clone_ctx);
3606 }
3607
3608 struct perf_read_data {
3609         struct perf_event *event;
3610         bool group;
3611         int ret;
3612 };
3613
3614 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
3615 {
3616         u16 local_pkg, event_pkg;
3617
3618         if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
3619                 int local_cpu = smp_processor_id();
3620
3621                 event_pkg = topology_physical_package_id(event_cpu);
3622                 local_pkg = topology_physical_package_id(local_cpu);
3623
3624                 if (event_pkg == local_pkg)
3625                         return local_cpu;
3626         }
3627
3628         return event_cpu;
3629 }
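/*
 * In other words: for events whose group has PERF_EV_CAP_READ_ACTIVE_PKG
 * (typically uncore/package-scope PMUs), any CPU in the same physical
 * package can read the counter, so we prefer the local CPU and avoid an
 * unnecessary cross-CPU call when the packages match.
 */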
3630
3631 /*
3632  * Cross CPU call to read the hardware event
3633  */
3634 static void __perf_event_read(void *info)
3635 {
3636         struct perf_read_data *data = info;
3637         struct perf_event *sub, *event = data->event;
3638         struct perf_event_context *ctx = event->ctx;
3639         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3640         struct pmu *pmu = event->pmu;
3641
3642         /*
3643          * If this is a task context, we need to check whether it is
3644          * the current task context of this CPU.  If not, it has been
3645          * scheduled out before the smp call arrived.  In that case
3646          * event->count would have been updated to a recent sample
3647          * when the event was scheduled out.
3648          */
3649         if (ctx->task && cpuctx->task_ctx != ctx)
3650                 return;
3651
3652         raw_spin_lock(&ctx->lock);
3653         if (ctx->is_active) {
3654                 update_context_time(ctx);
3655                 update_cgrp_time_from_event(event);
3656         }
3657
3658         update_event_times(event);
3659         if (event->state != PERF_EVENT_STATE_ACTIVE)
3660                 goto unlock;
3661
3662         if (!data->group) {
3663                 pmu->read(event);
3664                 data->ret = 0;
3665                 goto unlock;
3666         }
3667
3668         pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3669
3670         pmu->read(event);
3671
3672         list_for_each_entry(sub, &event->sibling_list, group_entry) {
3673                 update_event_times(sub);
3674                 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3675                         /*
3676                          * Use the sibling's PMU rather than @event's, since the
3677                          * sibling could be on a different (e.g. software) PMU.
3678                          */
3679                         sub->pmu->read(sub);
3680                 }
3681         }
3682
3683         data->ret = pmu->commit_txn(pmu);
3684
3685 unlock:
3686         raw_spin_unlock(&ctx->lock);
3687 }
3688
3689 static inline u64 perf_event_count(struct perf_event *event)
3690 {
3691         return local64_read(&event->count) + atomic64_read(&event->child_count);
3692 }
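/*
 * Note: event->count is the count accumulated by this event itself, while
 * child_count accumulates the counts folded in from its (inherited) child
 * events as they exit, so the sum is the total for the whole inheritance
 * tree.
 */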
3693
3694 /*
3695  * NMI-safe method to read a local event, that is an event
3696  * that:
3697  *   - is either for the current task, or for this CPU
3698  *   - does not have inherit set, because inherited task events
3699  *     will not be local and we cannot read them atomically
3700  *   - must not have a pmu::count method
3701  */
3702 int perf_event_read_local(struct perf_event *event, u64 *value)
3703 {
3704         unsigned long flags;
3705         int ret = 0;
3706
3707         /*
3708          * Disabling interrupts avoids all counter scheduling (context
3709          * switches, timer based rotation and IPIs).
3710          */
3711         local_irq_save(flags);
3712
3713         /*
3714          * It must not be an event with inherit set; we cannot read
3715          * all child counters from atomic context.
3716          */
3717         if (event->attr.inherit) {
3718                 ret = -EOPNOTSUPP;
3719                 goto out;
3720         }
3721
3722         /* If this is a per-task event, it must be for current */
3723         if ((event->attach_state & PERF_ATTACH_TASK) &&
3724             event->hw.target != current) {
3725                 ret = -EINVAL;
3726                 goto out;
3727         }
3728
3729         /* If this is a per-CPU event, it must be for this CPU */
3730         if (!(event->attach_state & PERF_ATTACH_TASK) &&
3731             event->cpu != smp_processor_id()) {
3732                 ret = -EINVAL;
3733                 goto out;
3734         }
3735
3736         /*
3737          * If the event is currently on this CPU, it's either a per-task event
3738          * or local to this CPU. Furthermore, it means it's ACTIVE (otherwise
3739          * oncpu == -1).
3740          */
3741         if (event->oncpu == smp_processor_id())
3742                 event->pmu->read(event);
3743
3744         *value = local64_read(&event->count);
3745 out:
3746         local_irq_restore(flags);
3747
3748         return ret;
3749 }
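/*
 * A minimal usage sketch (illustrative, not part of the original file).
 * In-kernel callers such as the BPF perf_event_read helper invoke this from
 * contexts where sleeping is not allowed:
 *
 *	u64 value;
 *	int err = perf_event_read_local(event, &value);
 *
 * On success (err == 0), value holds the event's up-to-date local count;
 * inherited and foreign (other task/CPU) events are rejected as per the
 * checks above.
 */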
3750
3751 static int perf_event_read(struct perf_event *event, bool group)
3752 {
3753         int event_cpu, ret = 0;
3754
3755         /*
3756          * If event is enabled and currently active on a CPU, update the
3757          * value in the event structure:
3758          */
3759         if (event->state == PERF_EVENT_STATE_ACTIVE) {
3760                 struct perf_read_data data = {
3761                         .event = event,
3762                         .group = group,
3763                         .ret = 0,
3764                 };
3765
3766                 event_cpu = READ_ONCE(event->oncpu);
3767                 if ((unsigned)event_cpu >= nr_cpu_ids)
3768                         return 0;
3769
3770                 preempt_disable();
3771                 event_cpu = __perf_event_read_cpu(event, event_cpu);
3772
3773                 /*
3774                  * Purposely ignore the smp_call_function_single() return
3775                  * value.
3776                  *
3777                  * If event_cpu isn't a valid CPU it means the event got
3778                  * scheduled out and that will have updated the event count.
3779                  *
3780                  * Therefore, either way, we'll have an up-to-date event count
3781                  * after this.
3782                  */
3783                 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
3784                 preempt_enable();
3785                 ret = data.ret;
3786         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3787                 struct perf_event_context *ctx = event->ctx;
3788                 unsigned long flags;
3789
3790                 raw_spin_lock_irqsave(&ctx->lock, flags);
3791                 /*
3792                  * We may read while the context is not active
3793                  * (e.g., the thread is blocked); in that case
3794                  * we cannot update the context time.
3795                  */
3796                 if (ctx->is_active) {
3797                         update_context_time(ctx);
3798                         update_cgrp_time_from_event(event);
3799                 }
3800                 if (group)
3801                         update_group_times(event);
3802                 else
3803                         update_event_times(event);
3804                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3805         }
3806
3807         return ret;
3808 }
3809
3810 /*
3811  * Initialize the perf_event context in a task_struct:
3812  */
3813 static void __perf_event_init_context(struct perf_event_context *ctx)
3814 {
3815         raw_spin_lock_init(&ctx->lock);
3816         mutex_init(&ctx->mutex);
3817         INIT_LIST_HEAD(&ctx->active_ctx_list);
3818         INIT_LIST_HEAD(&ctx->pinned_groups);
3819         INIT_LIST_HEAD(&ctx->flexible_groups);
3820         INIT_LIST_HEAD(&ctx->event_list);
3821         atomic_set(&ctx->refcount, 1);
3822 }
3823
3824 static struct perf_event_context *
3825 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3826 {
3827         struct perf_event_context *ctx;
3828
3829         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3830         if (!ctx)
3831                 return NULL;
3832
3833         __perf_event_init_context(ctx);
3834         if (task) {
3835                 ctx->task = task;
3836                 get_task_struct(task);
3837         }
3838         ctx->pmu = pmu;
3839
3840         return ctx;
3841 }
3842
3843 static struct task_struct *
3844 find_lively_task_by_vpid(pid_t vpid)
3845 {
3846         struct task_struct *task;
3847
3848         rcu_read_lock();
3849         if (!vpid)
3850                 task = current;
3851         else
3852                 task = find_task_by_vpid(vpid);
3853         if (task)
3854                 get_task_struct(task);
3855         rcu_read_unlock();
3856
3857         if (!task)
3858                 return ERR_PTR(-ESRCH);
3859
3860         return task;
3861 }
3862
3863 /*
3864  * Returns a matching context with refcount and pincount.
3865  */
3866 static struct perf_event_context *
3867 find_get_context(struct pmu *pmu, struct task_struct *task,
3868                 struct perf_event *event)
3869 {
3870         struct perf_event_context *ctx, *clone_ctx = NULL;
3871         struct perf_cpu_context *cpuctx;
3872         void *task_ctx_data = NULL;
3873         unsigned long flags;
3874         int ctxn, err;
3875         int cpu = event->cpu;
3876
3877         if (!task) {
3878                 /* Must be root to operate on a CPU event: */
3879                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3880                         return ERR_PTR(-EACCES);
3881
3882                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3883                 ctx = &cpuctx->ctx;
3884                 get_ctx(ctx);
3885                 ++ctx->pin_count;
3886
3887                 return ctx;
3888         }
3889
3890         err = -EINVAL;
3891         ctxn = pmu->task_ctx_nr;
3892         if (ctxn < 0)
3893                 goto errout;
3894
3895         if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3896                 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3897                 if (!task_ctx_data) {
3898                         err = -ENOMEM;
3899                         goto errout;
3900                 }
3901         }
3902
3903 retry:
3904         ctx = perf_lock_task_context(task, ctxn, &flags);
3905         if (ctx) {
3906                 clone_ctx = unclone_ctx(ctx);
3907                 ++ctx->pin_count;
3908
3909                 if (task_ctx_data && !ctx->task_ctx_data) {
3910                         ctx->task_ctx_data = task_ctx_data;
3911                         task_ctx_data = NULL;
3912                 }
3913                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3914
3915                 if (clone_ctx)
3916                         put_ctx(clone_ctx);
3917         } else {
3918                 ctx = alloc_perf_context(pmu, task);
3919                 err = -ENOMEM;
3920                 if (!ctx)
3921                         goto errout;
3922
3923                 if (task_ctx_data) {
3924                         ctx->task_ctx_data = task_ctx_data;
3925                         task_ctx_data = NULL;
3926                 }
3927
3928                 err = 0;
3929                 mutex_lock(&task->perf_event_mutex);
3930                 /*
3931                  * If it has already passed perf_event_exit_task(),
3932                  * we must see PF_EXITING; it takes this mutex too.
3933                  */
3934                 if (task->flags & PF_EXITING)
3935                         err = -ESRCH;
3936                 else if (task->perf_event_ctxp[ctxn])
3937                         err = -EAGAIN;
3938                 else {
3939                         get_ctx(ctx);
3940                         ++ctx->pin_count;
3941                         rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3942                 }
3943                 mutex_unlock(&task->perf_event_mutex);
3944
3945                 if (unlikely(err)) {
3946                         put_ctx(ctx);
3947
3948                         if (err == -EAGAIN)
3949                                 goto retry;
3950                         goto errout;
3951                 }
3952         }
3953
3954         kfree(task_ctx_data);
3955         return ctx;
3956
3957 errout:
3958         kfree(task_ctx_data);
3959         return ERR_PTR(err);
3960 }
3961
3962 static void perf_event_free_filter(struct perf_event *event);
3963 static void perf_event_free_bpf_prog(struct perf_event *event);
3964
3965 static void free_event_rcu(struct rcu_head *head)
3966 {
3967         struct perf_event *event;
3968
3969         event = container_of(head, struct perf_event, rcu_head);
3970         if (event->ns)
3971                 put_pid_ns(event->ns);
3972         perf_event_free_filter(event);
3973         kfree(event);
3974 }
3975
3976 static void ring_buffer_attach(struct perf_event *event,
3977                                struct ring_buffer *rb);
3978
3979 static void detach_sb_event(struct perf_event *event)
3980 {
3981         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
3982
3983         raw_spin_lock(&pel->lock);
3984         list_del_rcu(&event->sb_list);
3985         raw_spin_unlock(&pel->lock);
3986 }
3987
3988 static bool is_sb_event(struct perf_event *event)
3989 {
3990         struct perf_event_attr *attr = &event->attr;
3991
3992         if (event->parent)
3993                 return false;
3994
3995         if (event->attach_state & PERF_ATTACH_TASK)
3996                 return false;
3997
3998         if (attr->mmap || attr->mmap_data || attr->mmap2 ||
3999             attr->comm || attr->comm_exec ||
4000             attr->task ||
4001             attr->context_switch)
4002                 return true;
4003         return false;
4004 }
4005
4006 static void unaccount_pmu_sb_event(struct perf_event *event)
4007 {
4008         if (is_sb_event(event))
4009                 detach_sb_event(event);
4010 }
4011
4012 static void unaccount_event_cpu(struct perf_event *event, int cpu)
4013 {
4014         if (event->parent)
4015                 return;
4016
4017         if (is_cgroup_event(event))
4018                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4019 }
4020
4021 #ifdef CONFIG_NO_HZ_FULL
4022 static DEFINE_SPINLOCK(nr_freq_lock);
4023 #endif
4024
4025 static void unaccount_freq_event_nohz(void)
4026 {
4027 #ifdef CONFIG_NO_HZ_FULL
4028         spin_lock(&nr_freq_lock);
4029         if (atomic_dec_and_test(&nr_freq_events))
4030                 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4031         spin_unlock(&nr_freq_lock);
4032 #endif
4033 }
4034
4035 static void unaccount_freq_event(void)
4036 {
4037         if (tick_nohz_full_enabled())
4038                 unaccount_freq_event_nohz();
4039         else
4040                 atomic_dec(&nr_freq_events);
4041 }
4042
4043 static void unaccount_event(struct perf_event *event)
4044 {
4045         bool dec = false;
4046
4047         if (event->parent)
4048                 return;
4049
4050         if (event->attach_state & PERF_ATTACH_TASK)
4051                 dec = true;
4052         if (event->attr.mmap || event->attr.mmap_data)
4053                 atomic_dec(&nr_mmap_events);
4054         if (event->attr.comm)
4055                 atomic_dec(&nr_comm_events);
4056         if (event->attr.namespaces)
4057                 atomic_dec(&nr_namespaces_events);
4058         if (event->attr.task)
4059                 atomic_dec(&nr_task_events);
4060         if (event->attr.freq)
4061                 unaccount_freq_event();
4062         if (event->attr.context_switch) {
4063                 dec = true;
4064                 atomic_dec(&nr_switch_events);
4065         }
4066         if (is_cgroup_event(event))
4067                 dec = true;
4068         if (has_branch_stack(event))
4069                 dec = true;
4070
4071         if (dec) {
4072                 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4073                         schedule_delayed_work(&perf_sched_work, HZ);
4074         }
4075
4076         unaccount_event_cpu(event, event->cpu);
4077
4078         unaccount_pmu_sb_event(event);
4079 }
4080
4081 static void perf_sched_delayed(struct work_struct *work)
4082 {
4083         mutex_lock(&perf_sched_mutex);
4084         if (atomic_dec_and_test(&perf_sched_count))
4085                 static_branch_disable(&perf_sched_events);
4086         mutex_unlock(&perf_sched_mutex);
4087 }
4088
4089 /*
4090  * The following implement mutual exclusion of events on "exclusive" pmus
4091  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
4092  * at a time, so we disallow creating events that might conflict, namely:
4093  *
4094  *  1) cpu-wide events in the presence of per-task events,
4095  *  2) per-task events in the presence of cpu-wide events,
4096  *  3) two matching events on the same context.
4097  *
4098  * The former two cases are handled in the allocation path (perf_event_alloc(),
4099  * _free_event()), the latter -- before the first perf_install_in_context().
4100  */
4101 static int exclusive_event_init(struct perf_event *event)
4102 {
4103         struct pmu *pmu = event->pmu;
4104
4105         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4106                 return 0;
4107
4108         /*
4109          * Prevent co-existence of per-task and cpu-wide events on the
4110          * same exclusive pmu.
4111          *
4112          * Negative pmu::exclusive_cnt means there are cpu-wide
4113          * events on this "exclusive" pmu, positive means there are
4114          * per-task events.
4115          *
4116          * Since this is called in perf_event_alloc() path, event::ctx
4117          * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
4118          * to mean "per-task event", because unlike other attach states it
4119          * never gets cleared.
4120          */
4121         if (event->attach_state & PERF_ATTACH_TASK) {
4122                 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4123                         return -EBUSY;
4124         } else {
4125                 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4126                         return -EBUSY;
4127         }
4128
4129         return 0;
4130 }
4131
4132 static void exclusive_event_destroy(struct perf_event *event)
4133 {
4134         struct pmu *pmu = event->pmu;
4135
4136         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4137                 return;
4138
4139         /* see comment in exclusive_event_init() */
4140         if (event->attach_state & PERF_ATTACH_TASK)
4141                 atomic_dec(&pmu->exclusive_cnt);
4142         else
4143                 atomic_inc(&pmu->exclusive_cnt);
4144 }
4145
4146 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4147 {
4148         if ((e1->pmu == e2->pmu) &&
4149             (e1->cpu == e2->cpu ||
4150              e1->cpu == -1 ||
4151              e2->cpu == -1))
4152                 return true;
4153         return false;
4154 }
4155
4156 /* Called under the same ctx::mutex as perf_install_in_context() */
4157 static bool exclusive_event_installable(struct perf_event *event,
4158                                         struct perf_event_context *ctx)
4159 {
4160         struct perf_event *iter_event;
4161         struct pmu *pmu = event->pmu;
4162
4163         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4164                 return true;
4165
4166         list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4167                 if (exclusive_event_match(iter_event, event))
4168                         return false;
4169         }
4170
4171         return true;
4172 }
4173
4174 static void perf_addr_filters_splice(struct perf_event *event,
4175                                        struct list_head *head);
4176
4177 static void _free_event(struct perf_event *event)
4178 {
4179         irq_work_sync(&event->pending);
4180
4181         unaccount_event(event);
4182
4183         if (event->rb) {
4184                 /*
4185                  * Can happen when we close an event with redirected output.
4186                  *
4187                  * Since we have a 0 refcount, perf_mmap_close() will skip
4188                  * over us, possibly making our ring_buffer_put() the last.
4189                  */
4190                 mutex_lock(&event->mmap_mutex);
4191                 ring_buffer_attach(event, NULL);
4192                 mutex_unlock(&event->mmap_mutex);
4193         }
4194
4195         if (is_cgroup_event(event))
4196                 perf_detach_cgroup(event);
4197
4198         if (!event->parent) {
4199                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4200                         put_callchain_buffers();
4201         }
4202
4203         perf_event_free_bpf_prog(event);
4204         perf_addr_filters_splice(event, NULL);
4205         kfree(event->addr_filters_offs);
4206
4207         if (event->destroy)
4208                 event->destroy(event);
4209
4210         if (event->ctx)
4211                 put_ctx(event->ctx);
4212
4213         if (event->hw.target)
4214                 put_task_struct(event->hw.target);
4215
4216         exclusive_event_destroy(event);
4217         module_put(event->pmu->module);
4218
4219         call_rcu(&event->rcu_head, free_event_rcu);
4220 }
4221
4222 /*
4223  * Used to free events which have a known refcount of 1, such as in error paths
4224  * where the event isn't exposed yet, and for inherited events.
4225  */
4226 static void free_event(struct perf_event *event)
4227 {
4228         if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4229                                 "unexpected event refcount: %ld; ptr=%p\n",
4230                                 atomic_long_read(&event->refcount), event)) {
4231                 /* leak to avoid use-after-free */
4232                 return;
4233         }
4234
4235         _free_event(event);
4236 }
4237
4238 /*
4239  * Remove user event from the owner task.
4240  */
4241 static void perf_remove_from_owner(struct perf_event *event)
4242 {
4243         struct task_struct *owner;
4244
4245         rcu_read_lock();
4246         /*
4247          * Matches the smp_store_release() in perf_event_exit_task(). If we
4248          * observe !owner it means the list deletion is complete and we can
4249          * indeed free this event, otherwise we need to serialize on
4250          * owner->perf_event_mutex.
4251          */
4252         owner = READ_ONCE(event->owner);
4253         if (owner) {
4254                 /*
4255                  * Since delayed_put_task_struct() also drops the last
4256                  * task reference we can safely take a new reference
4257                  * while holding the rcu_read_lock().
4258                  */
4259                 get_task_struct(owner);
4260         }
4261         rcu_read_unlock();
4262
4263         if (owner) {
4264                 /*
4265                  * If we're here through perf_event_exit_task() we're already
4266                  * holding ctx->mutex which would be an inversion wrt. the
4267                  * normal lock order.
4268                  *
4269                  * However, we can safely take this lock because it's the child
4270                  * ctx->mutex.
4271                  */
4272                 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4273
4274                 /*
4275                  * We have to re-check the event->owner field, if it is cleared
4276                  * we raced with perf_event_exit_task(), acquiring the mutex
4277                  * ensured they're done, and we can proceed with freeing the
4278                  * event.
4279                  */
4280                 if (event->owner) {
4281                         list_del_init(&event->owner_entry);
4282                         smp_store_release(&event->owner, NULL);
4283                 }
4284                 mutex_unlock(&owner->perf_event_mutex);
4285                 put_task_struct(owner);
4286         }
4287 }
4288
4289 static void put_event(struct perf_event *event)
4290 {
4291         if (!atomic_long_dec_and_test(&event->refcount))
4292                 return;
4293
4294         _free_event(event);
4295 }
4296
4297 /*
4298  * Kill an event dead; while event::refcount will preserve the event
4299  * object, it will not preserve its functionality. Once the last 'user'
4300  * gives up the object, we'll destroy the thing.
4301  */
4302 int perf_event_release_kernel(struct perf_event *event)
4303 {
4304         struct perf_event_context *ctx = event->ctx;
4305         struct perf_event *child, *tmp;
4306
4307         /*
4308          * If we got here through err_file: fput(event_file); we will not have
4309          * attached to a context yet.
4310          */
4311         if (!ctx) {
4312                 WARN_ON_ONCE(event->attach_state &
4313                                 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4314                 goto no_ctx;
4315         }
4316
4317         if (!is_kernel_event(event))
4318                 perf_remove_from_owner(event);
4319
4320         ctx = perf_event_ctx_lock(event);
4321         WARN_ON_ONCE(ctx->parent_ctx);
4322         perf_remove_from_context(event, DETACH_GROUP);
4323
4324         raw_spin_lock_irq(&ctx->lock);
4325         /*
4326          * Mark this event as STATE_DEAD, there is no external reference to it
4327          * anymore.
4328          *
4329          * Anybody acquiring event->child_mutex after the below loop _must_
4330          * also see this, most importantly inherit_event() which will avoid
4331          * placing more children on the list.
4332          *
4333          * Thus this guarantees that we will in fact observe and kill _ALL_
4334          * child events.
4335          */
4336         event->state = PERF_EVENT_STATE_DEAD;
4337         raw_spin_unlock_irq(&ctx->lock);
4338
4339         perf_event_ctx_unlock(event, ctx);
4340
4341 again:
4342         mutex_lock(&event->child_mutex);
4343         list_for_each_entry(child, &event->child_list, child_list) {
4344
4345                 /*
4346                  * Cannot change: child events are not migrated, see the
4347                  * comment with perf_event_ctx_lock_nested().
4348                  */
4349                 ctx = READ_ONCE(child->ctx);
4350                 /*
4351                  * Since child_mutex nests inside ctx::mutex, we must jump
4352                  * through hoops. We start by grabbing a reference on the ctx.
4353                  *
4354                  * Since the event cannot get freed while we hold the
4355                  * child_mutex, the context must also exist and have a !0
4356                  * reference count.
4357                  */
4358                 get_ctx(ctx);
4359
4360                 /*
4361                  * Now that we have a ctx ref, we can drop child_mutex, and
4362                  * acquire ctx::mutex without fear of it going away. Then we
4363                  * can re-acquire child_mutex.
4364                  */
4365                 mutex_unlock(&event->child_mutex);
4366                 mutex_lock(&ctx->mutex);
4367                 mutex_lock(&event->child_mutex);
4368
4369                 /*
4370                  * Now that we hold ctx::mutex and child_mutex, revalidate our
4371                  * state: if child is still the first entry, it didn't get freed
4372                  * and we can continue.
4373                  */
4374                 tmp = list_first_entry_or_null(&event->child_list,
4375                                                struct perf_event, child_list);
4376                 if (tmp == child) {
4377                         perf_remove_from_context(child, DETACH_GROUP);
4378                         list_del(&child->child_list);
4379                         free_event(child);
4380                         /*
4381                          * This matches the refcount bump in inherit_event();
4382                          * this can't be the last reference.
4383                          */
4384                         put_event(event);
4385                 }
4386
4387                 mutex_unlock(&event->child_mutex);
4388                 mutex_unlock(&ctx->mutex);
4389                 put_ctx(ctx);
4390                 goto again;
4391         }
4392         mutex_unlock(&event->child_mutex);
4393
4394 no_ctx:
4395         put_event(event); /* Must be the 'last' reference */
4396         return 0;
4397 }
4398 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
4399
4400 /*
4401  * Called when the last reference to the file is gone.
4402  */
4403 static int perf_release(struct inode *inode, struct file *file)
4404 {
4405         perf_event_release_kernel(file->private_data);
4406         return 0;
4407 }
4408
4409 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4410 {
4411         struct perf_event *child;
4412         u64 total = 0;
4413
4414         *enabled = 0;
4415         *running = 0;
4416
4417         mutex_lock(&event->child_mutex);
4418
4419         (void)perf_event_read(event, false);
4420         total += perf_event_count(event);
4421
4422         *enabled += event->total_time_enabled +
4423                         atomic64_read(&event->child_total_time_enabled);
4424         *running += event->total_time_running +
4425                         atomic64_read(&event->child_total_time_running);
4426
4427         list_for_each_entry(child, &event->child_list, child_list) {
4428                 (void)perf_event_read(child, false);
4429                 total += perf_event_count(child);
4430                 *enabled += child->total_time_enabled;
4431                 *running += child->total_time_running;
4432         }
4433         mutex_unlock(&event->child_mutex);
4434
4435         return total;
4436 }
4437 EXPORT_SYMBOL_GPL(perf_event_read_value);
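/*
 * Illustrative sketch (not part of the original file) of how an in-kernel
 * user of a counter might consume this interface:
 *
 *	u64 enabled, running;
 *	u64 count = perf_event_read_value(event, &enabled, &running);
 *
 * The returned count and the enabled/running times already include the
 * contributions of all child (inherited) events, since the child list is
 * walked under child_mutex above.
 */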
4438
4439 static int __perf_read_group_add(struct perf_event *leader,
4440                                         u64 read_format, u64 *values)
4441 {
4442         struct perf_event_context *ctx = leader->ctx;
4443         struct perf_event *sub;
4444         unsigned long flags;
4445         int n = 1; /* skip @nr */
4446         int ret;
4447
4448         ret = perf_event_read(leader, true);
4449         if (ret)
4450                 return ret;
4451
4452         raw_spin_lock_irqsave(&ctx->lock, flags);
4453
4454         /*
4455          * Since we co-schedule groups, {enabled,running} times of siblings
4456          * will be identical to those of the leader, so we only publish one
4457          * set.
4458          */
4459         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4460                 values[n++] += leader->total_time_enabled +
4461                         atomic64_read(&leader->child_total_time_enabled);
4462         }
4463
4464         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4465                 values[n++] += leader->total_time_running +
4466                         atomic64_read(&leader->child_total_time_running);
4467         }
4468
4469         /*
4470          * Write {count,id} tuples for every sibling.
4471          */
4472         values[n++] += perf_event_count(leader);
4473         if (read_format & PERF_FORMAT_ID)
4474                 values[n++] = primary_event_id(leader);
4475
4476         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4477                 values[n++] += perf_event_count(sub);
4478                 if (read_format & PERF_FORMAT_ID)
4479                         values[n++] = primary_event_id(sub);
4480         }
4481
4482         raw_spin_unlock_irqrestore(&ctx->lock, flags);
4483         return 0;
4484 }
4485
4486 static int perf_read_group(struct perf_event *event,
4487                                    u64 read_format, char __user *buf)
4488 {
4489         struct perf_event *leader = event->group_leader, *child;
4490         struct perf_event_context *ctx = leader->ctx;
4491         int ret;
4492         u64 *values;
4493
4494         lockdep_assert_held(&ctx->mutex);
4495
4496         values = kzalloc(event->read_size, GFP_KERNEL);
4497         if (!values)
4498                 return -ENOMEM;
4499
4500         values[0] = 1 + leader->nr_siblings;
4501
4502         /*
4503          * By locking the child_mutex of the leader we effectively
4504          * lock the child list of all siblings. XXX: explain how.
4505          */
4506         mutex_lock(&leader->child_mutex);
4507
4508         ret = __perf_read_group_add(leader, read_format, values);
4509         if (ret)
4510                 goto unlock;
4511
4512         list_for_each_entry(child, &leader->child_list, child_list) {
4513                 ret = __perf_read_group_add(child, read_format, values);
4514                 if (ret)
4515                         goto unlock;
4516         }
4517
4518         mutex_unlock(&leader->child_mutex);
4519
4520         ret = event->read_size;
4521         if (copy_to_user(buf, values, event->read_size))
4522                 ret = -EFAULT;
4523         goto out;
4524
4525 unlock:
4526         mutex_unlock(&leader->child_mutex);
4527 out:
4528         kfree(values);
4529         return ret;
4530 }
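/*
 * For reference (matching the code above, not additional semantics): a
 * PERF_FORMAT_GROUP read returns one buffer shaped as
 *
 *	{ u64 nr;
 *	  { u64 time_enabled; }	&& PERF_FORMAT_TOTAL_TIME_ENABLED
 *	  { u64 time_running; }	&& PERF_FORMAT_TOTAL_TIME_RUNNING
 *	  { u64 value;
 *	    { u64 id; }		&& PERF_FORMAT_ID
 *	  }			cntr[nr];
 *	}
 *
 * where the leader's value comes first, followed by one entry per sibling,
 * and child events contribute by accumulating into the same slots.
 */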
4531
4532 static int perf_read_one(struct perf_event *event,
4533                                  u64 read_format, char __user *buf)
4534 {
4535         u64 enabled, running;
4536         u64 values[4];
4537         int n = 0;
4538
4539         values[n++] = perf_event_read_value(event, &enabled, &running);
4540         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4541                 values[n++] = enabled;
4542         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4543                 values[n++] = running;
4544         if (read_format & PERF_FORMAT_ID)
4545                 values[n++] = primary_event_id(event);
4546
4547         if (copy_to_user(buf, values, n * sizeof(u64)))
4548                 return -EFAULT;
4549
4550         return n * sizeof(u64);
4551 }
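/*
 * For reference: a non-group read returns
 *
 *	{ u64 value;
 *	  { u64 time_enabled; }	&& PERF_FORMAT_TOTAL_TIME_ENABLED
 *	  { u64 time_running; }	&& PERF_FORMAT_TOTAL_TIME_RUNNING
 *	  { u64 id; }		&& PERF_FORMAT_ID
 *	}
 *
 * which is exactly the values[] array filled in above.
 */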
4552
4553 static bool is_event_hup(struct perf_event *event)
4554 {
4555         bool no_children;
4556
4557         if (event->state > PERF_EVENT_STATE_EXIT)
4558                 return false;
4559
4560         mutex_lock(&event->child_mutex);
4561         no_children = list_empty(&event->child_list);
4562         mutex_unlock(&event->child_mutex);
4563         return no_children;
4564 }
4565
4566 /*
4567  * Read the performance event - simple non-blocking version for now
4568  */
4569 static ssize_t
4570 __perf_read(struct perf_event *event, char __user *buf, size_t count)
4571 {
4572         u64 read_format = event->attr.read_format;
4573         int ret;
4574
4575         /*
4576          * Return end-of-file for a read on an event that is in
4577          * error state (i.e. because it was pinned but it couldn't be
4578          * scheduled onto the CPU at some point).
4579          */
4580         if (event->state == PERF_EVENT_STATE_ERROR)
4581                 return 0;
4582
4583         if (count < event->read_size)
4584                 return -ENOSPC;
4585
4586         WARN_ON_ONCE(event->ctx->parent_ctx);
4587         if (read_format & PERF_FORMAT_GROUP)
4588                 ret = perf_read_group(event, read_format, buf);
4589         else
4590                 ret = perf_read_one(event, read_format, buf);
4591
4592         return ret;
4593 }
4594
4595 static ssize_t
4596 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4597 {
4598         struct perf_event *event = file->private_data;
4599         struct perf_event_context *ctx;
4600         int ret;
4601
4602         ctx = perf_event_ctx_lock(event);
4603         ret = __perf_read(event, buf, count);
4604         perf_event_ctx_unlock(event, ctx);
4605
4606         return ret;
4607 }
4608
4609 static unsigned int perf_poll(struct file *file, poll_table *wait)
4610 {
4611         struct perf_event *event = file->private_data;
4612         struct ring_buffer *rb;
4613         unsigned int events = POLLHUP;
4614
4615         poll_wait(file, &event->waitq, wait);
4616
4617         if (is_event_hup(event))
4618                 return events;
4619
4620         /*
4621          * Pin the event->rb by taking event->mmap_mutex; otherwise
4622          * perf_event_set_output() can swizzle our rb and make us miss wakeups.
4623          */
4624         mutex_lock(&event->mmap_mutex);
4625         rb = event->rb;
4626         if (rb)
4627                 events = atomic_xchg(&rb->poll, 0);
4628         mutex_unlock(&event->mmap_mutex);
4629         return events;
4630 }
4631
4632 static void _perf_event_reset(struct perf_event *event)
4633 {
4634         (void)perf_event_read(event, false);
4635         local64_set(&event->count, 0);
4636         perf_event_update_userpage(event);
4637 }
4638
4639 /*
4640  * Holding the top-level event's child_mutex means that any
4641  * descendant process that has inherited this event will block
4642  * in perf_event_exit_event() if it goes to exit, thus satisfying the
4643  * task existence requirements of perf_event_enable/disable.
4644  */
4645 static void perf_event_for_each_child(struct perf_event *event,
4646                                         void (*func)(struct perf_event *))
4647 {
4648         struct perf_event *child;
4649
4650         WARN_ON_ONCE(event->ctx->parent_ctx);
4651
4652         mutex_lock(&event->child_mutex);
4653         func(event);
4654         list_for_each_entry(child, &event->child_list, child_list)
4655                 func(child);
4656         mutex_unlock(&event->child_mutex);
4657 }
4658
4659 static void perf_event_for_each(struct perf_event *event,
4660                                   void (*func)(struct perf_event *))
4661 {
4662         struct perf_event_context *ctx = event->ctx;
4663         struct perf_event *sibling;
4664
4665         lockdep_assert_held(&ctx->mutex);
4666
4667         event = event->group_leader;
4668
4669         perf_event_for_each_child(event, func);
4670         list_for_each_entry(sibling, &event->sibling_list, group_entry)
4671                 perf_event_for_each_child(sibling, func);
4672 }
4673
4674 static void __perf_event_period(struct perf_event *event,
4675                                 struct perf_cpu_context *cpuctx,
4676                                 struct perf_event_context *ctx,
4677                                 void *info)
4678 {
4679         u64 value = *((u64 *)info);
4680         bool active;
4681
4682         if (event->attr.freq) {
4683                 event->attr.sample_freq = value;
4684         } else {
4685                 event->attr.sample_period = value;
4686                 event->hw.sample_period = value;
4687         }
4688
4689         active = (event->state == PERF_EVENT_STATE_ACTIVE);
4690         if (active) {
4691                 perf_pmu_disable(ctx->pmu);
4692                 /*
4693                  * We could be throttled; unthrottle now to avoid the tick
4694                  * trying to unthrottle while we already re-started the event.
4695                  */
4696                 if (event->hw.interrupts == MAX_INTERRUPTS) {
4697                         event->hw.interrupts = 0;
4698                         perf_log_throttle(event, 1);
4699                 }
4700                 event->pmu->stop(event, PERF_EF_UPDATE);
4701         }
4702
4703         local64_set(&event->hw.period_left, 0);
4704
4705         if (active) {
4706                 event->pmu->start(event, PERF_EF_RELOAD);
4707                 perf_pmu_enable(ctx->pmu);
4708         }
4709 }
4710
4711 static int perf_event_period(struct perf_event *event, u64 __user *arg)
4712 {
4713         u64 value;
4714
4715         if (!is_sampling_event(event))
4716                 return -EINVAL;
4717
4718         if (copy_from_user(&value, arg, sizeof(value)))
4719                 return -EFAULT;
4720
4721         if (!value)
4722                 return -EINVAL;
4723
4724         if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4725                 return -EINVAL;
4726
4727         event_function_call(event, __perf_event_period, &value);
4728
4729         return 0;
4730 }
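
/*
 * Illustrative sketch (user-space side, not part of this file): the sample
 * period of an open sampling event fd can be updated roughly like so:
 *
 *	__u64 period = 100000;
 *	ioctl(fd, PERF_EVENT_IOC_PERIOD, &period);
 */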
4731
4732 static const struct file_operations perf_fops;
4733
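/*
 * Like fdget(), but also check that the file really is a perf event file;
 * on success the caller is responsible for the matching fdput().
 */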
4734 static inline int perf_fget_light(int fd, struct fd *p)
4735 {
4736         struct fd f = fdget(fd);
4737         if (!f.file)
4738                 return -EBADF;
4739
4740         if (f.file->f_op != &perf_fops) {
4741                 fdput(f);
4742                 return -EBADF;
4743         }
4744         *p = f;
4745         return 0;
4746 }
4747
4748 static int perf_event_set_output(struct perf_event *event,
4749                                  struct perf_event *output_event);
4750 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4751 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
4752
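/*
 * Main ioctl() switch; the enable/disable/reset commands fall through to
 * the loop at the bottom so they can be applied either to a single event
 * (plus its inherited children) or, with PERF_IOC_FLAG_GROUP, to the
 * whole group.
 */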
4753 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
4754 {
4755         void (*func)(struct perf_event *);
4756         u32 flags = arg;
4757
4758         switch (cmd) {
4759         case PERF_EVENT_IOC_ENABLE:
4760                 func = _perf_event_enable;
4761                 break;
4762         case PERF_EVENT_IOC_DISABLE:
4763                 func = _perf_event_disable;
4764                 break;
4765         case PERF_EVENT_IOC_RESET:
4766                 func = _perf_event_reset;
4767                 break;
4768
4769         case PERF_EVENT_IOC_REFRESH:
4770                 return _perf_event_refresh(event, arg);
4771
4772         case PERF_EVENT_IOC_PERIOD:
4773                 return perf_event_period(event, (u64 __user *)arg);
4774
4775         case PERF_EVENT_IOC_ID:
4776         {
4777                 u64 id = primary_event_id(event);
4778
4779                 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4780                         return -EFAULT;
4781                 return 0;
4782         }
4783
4784         case PERF_EVENT_IOC_SET_OUTPUT:
4785         {
4786                 int ret;
4787                 if (arg != -1) {
4788                         struct perf_event *output_event;
4789                         struct fd output;
4790                         ret = perf_fget_light(arg, &output);
4791                         if (ret)
4792                                 return ret;
4793                         output_event = output.file->private_data;
4794                         ret = perf_event_set_output(event, output_event);
4795                         fdput(output);
4796                 } else {
4797                         ret = perf_event_set_output(event, NULL);
4798                 }
4799                 return ret;
4800         }
4801
4802         case PERF_EVENT_IOC_SET_FILTER:
4803                 return perf_event_set_filter(event, (void __user *)arg);
4804
4805         case PERF_EVENT_IOC_SET_BPF:
4806                 return perf_event_set_bpf_prog(event, arg);
4807
4808         case PERF_EVENT_IOC_PAUSE_OUTPUT: {
4809                 struct ring_buffer *rb;
4810
4811                 rcu_read_lock();
4812                 rb = rcu_dereference(event->rb);
4813                 if (!rb || !rb->nr_pages) {
4814                         rcu_read_unlock();
4815                         return -EINVAL;
4816                 }
4817                 rb_toggle_paused(rb, !!arg);
4818                 rcu_read_unlock();
4819                 return 0;
4820         }
4821         default:
4822                 return -ENOTTY;
4823         }
4824
4825         if (flags & PERF_IOC_FLAG_GROUP)
4826                 perf_event_for_each(event, func);
4827         else
4828                 perf_event_for_each_child(event, func);
4829
4830         return 0;
4831 }
4832
4833 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4834 {
4835         struct perf_event *event = file->private_data;
4836         struct perf_event_context *ctx;
4837         long ret;
4838
4839         ctx = perf_event_ctx_lock(event);
4840         ret = _perf_ioctl(event, cmd, arg);
4841         perf_event_ctx_unlock(event, ctx);
4842
4843         return ret;
4844 }
4845
4846 #ifdef CONFIG_COMPAT
4847 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4848                                 unsigned long arg)
4849 {
4850         switch (_IOC_NR(cmd)) {
4851         case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4852         case _IOC_NR(PERF_EVENT_IOC_ID):
4853                 /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case) */
4854                 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4855                         cmd &= ~IOCSIZE_MASK;
4856                         cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4857                 }
4858                 break;
4859         }
4860         return perf_ioctl(file, cmd, arg);
4861 }
4862 #else
4863 # define perf_compat_ioctl NULL
4864 #endif
4865
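/*
 * Enable all events owned by the current task; backs the
 * PR_TASK_PERF_EVENTS_ENABLE prctl().
 */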
4866 int perf_event_task_enable(void)
4867 {
4868         struct perf_event_context *ctx;
4869         struct perf_event *event;
4870
4871         mutex_lock(&current->perf_event_mutex);
4872         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4873                 ctx = perf_event_ctx_lock(event);
4874                 perf_event_for_each_child(event, _perf_event_enable);
4875                 perf_event_ctx_unlock(event, ctx);
4876         }
4877         mutex_unlock(&current->perf_event_mutex);
4878
4879         return 0;
4880 }
4881
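/*
 * Disable all events owned by the current task; backs the
 * PR_TASK_PERF_EVENTS_DISABLE prctl().
 */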
4882 int perf_event_task_disable(void)
4883 {
4884         struct perf_event_context *ctx;
4885         struct perf_event *event;
4886
4887         mutex_lock(&current->perf_event_mutex);
4888         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4889                 ctx = perf_event_ctx_lock(event);
4890                 perf_event_for_each_child(event, _perf_event_disable);
4891                 perf_event_ctx_unlock(event, ctx);
4892         }
4893         mutex_unlock(&current->perf_event_mutex);
4894
4895         return 0;
4896 }
4897
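/*
 * Hardware index reported in the user page; 0 means the count cannot be
 * read directly from user space (e.g. via rdpmc).
 */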
4898 static int perf_event_index(struct perf_event *event)
4899 {
4900         if (event->hw.state & PERF_HES_STOPPED)
4901                 return 0;
4902
4903         if (event->state != PERF_EVENT_STATE_ACTIVE)
4904                 return 0;
4905
4906         return event->pmu->event_idx(event);
4907 }
4908
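/*
 * Compute now/enabled/running from the shadow context time snapshot
 * without taking any locks, so this is safe from NMI context.
 */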
4909 static void calc_timer_values(struct perf_event *event,
4910                                 u64 *now,
4911                                 u64 *enabled,
4912                                 u64 *running)
4913 {
4914         u64 ctx_time;
4915
4916         *now = perf_clock();
4917         ctx_time = event->shadow_ctx_time + *now;
4918         *enabled = ctx_time - event->tstamp_enabled;
4919         *running = ctx_time - event->tstamp_running;
4920 }
4921
4922 static void perf_event_init_userpage(struct perf_event *event)
4923 {
4924         struct perf_event_mmap_page *userpg;
4925         struct ring_buffer *rb;
4926
4927         rcu_read_lock();
4928         rb = rcu_dereference(event->rb);
4929         if (!rb)
4930                 goto unlock;
4931
4932         userpg = rb->user_page;
4933
4934         /* Allow new userspace to detect that bit 0 is deprecated */
4935         userpg->cap_bit0_is_deprecated = 1;
4936         userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4937         userpg->data_offset = PAGE_SIZE;
4938         userpg->data_size = perf_data_size(rb);
4939
4940 unlock:
4941         rcu_read_unlock();
4942 }
4943
4944 void __weak arch_perf_update_userpage(
4945         struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
4946 {
4947 }
4948
4949 /*
4950  * Callers need to ensure there can be no nesting of this function, otherwise
4951  * the seqlock logic goes bad. We cannot serialize this because the arch
4952  * code calls this from NMI context.
4953  */
4954 void perf_event_update_userpage(struct perf_event *event)
4955 {
4956         struct perf_event_mmap_page *userpg;
4957         struct ring_buffer *rb;
4958         u64 enabled, running, now;
4959
4960         rcu_read_lock();
4961         rb = rcu_dereference(event->rb);
4962         if (!rb)
4963                 goto unlock;
4964
4965         /*
4966          * compute total_time_enabled, total_time_running
4967          * based on snapshot values taken when the event
4968          * was last scheduled in.
4969          *
4970          * we cannot simply call update_context_time()
4971          * because of locking issues, as we can be called in
4972          * NMI context
4973          */
4974         calc_timer_values(event, &now, &enabled, &running);
4975
4976         userpg = rb->user_page;
4977         /*
4978          * Disable preemption so as to not let the corresponding user-space
4979          * spin too long if we get preempted.
4980          */
4981         preempt_disable();
4982         ++userpg->lock;
4983         barrier();
4984         userpg->index = perf_event_index(event);
4985         userpg->offset = perf_event_count(event);
4986         if (userpg->index)
4987                 userpg->offset -= local64_read(&event->hw.prev_count);
4988
4989         userpg->time_enabled = enabled +
4990                         atomic64_read(&event->child_total_time_enabled);
4991
4992         userpg->time_running = running +
4993                         atomic64_read(&event->child_total_time_running);
4994
4995         arch_perf_update_userpage(event, userpg, now);
4996
4997         barrier();
4998         ++userpg->lock;
4999         preempt_enable();
5000 unlock:
5001         rcu_read_unlock();
5002 }
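
/*
 * Illustrative sketch (user-space side, not part of this file) of how the
 * ->lock seqcount above is consumed when reading the mmap()ed control
 * page "pc":
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		... read pc->index, pc->offset, pc->time_enabled ...
 *		barrier();
 *	} while (pc->lock != seq);
 */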
5003
5004 static int perf_mmap_fault(struct vm_fault *vmf)
5005 {
5006         struct perf_event *event = vmf->vma->vm_file->private_data;
5007         struct ring_buffer *rb;
5008         int ret = VM_FAULT_SIGBUS;
5009
5010         if (vmf->flags & FAULT_FLAG_MKWRITE) {
5011                 if (vmf->pgoff == 0)
5012                         ret = 0;
5013                 return ret;
5014         }
5015
5016         rcu_read_lock();
5017         rb = rcu_dereference(event->rb);
5018         if (!rb)
5019                 goto unlock;
5020
5021         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5022                 goto unlock;
5023
5024         vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5025         if (!vmf->page)
5026                 goto unlock;
5027
5028         get_page(vmf->page);
5029         vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5030         vmf->page->index   = vmf->pgoff;
5031
5032         ret = 0;
5033 unlock:
5034         rcu_read_unlock();
5035
5036         return ret;
5037 }
5038
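/*
 * (Re)attach the event to a ring buffer: unlink it from the old rb's
 * event_list (if any), link it onto the new one and publish the new
 * event->rb pointer under RCU.
 */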
5039 static void ring_buffer_attach(struct perf_event *event,
5040                                struct ring_buffer *rb)
5041 {
5042         struct ring_buffer *old_rb = NULL;
5043         unsigned long flags;
5044
5045         if (event->rb) {
5046                 /*
5047                  * Should be impossible, we set this when removing
5048                  * event->rb_entry and wait/clear when adding event->rb_entry.
5049                  */
5050                 WARN_ON_ONCE(event->rcu_pending);
5051
5052                 old_rb = event->rb;
5053                 spin_lock_irqsave(&old_rb->event_lock, flags);
5054                 list_del_rcu(&event->rb_entry);
5055                 spin_unlock_irqrestore(&old_rb->event_lock, flags);
5056
5057                 event->rcu_batches = get_state_synchronize_rcu();
5058                 event->rcu_pending = 1;
5059         }
5060
5061         if (rb) {
5062                 if (event->rcu_pending) {
5063                         cond_synchronize_rcu(event->rcu_batches);
5064                         event->rcu_pending = 0;
5065                 }
5066
5067                 spin_lock_irqsave(&rb->event_lock, flags);
5068                 list_add_rcu(&event->rb_entry, &rb->event_list);
5069                 spin_unlock_irqrestore(&rb->event_lock, flags);
5070         }
5071
5072         /*
5073          * Avoid racing with perf_mmap_close(AUX): stop the event
5074          * before swizzling the event::rb pointer; if it's getting
5075          * unmapped, its aux_mmap_count will be 0 and it won't
5076          * restart. See the comment in __perf_pmu_output_stop().
5077          *
5078          * Data will inevitably be lost when set_output is done in
5079          * mid-air, but then again, whoever does it like this is
5080          * not in for the data anyway.
5081          */
5082         if (has_aux(event))
5083                 perf_event_stop(event, 0);
5084
5085         rcu_assign_pointer(event->rb, rb);
5086
5087         if (old_rb) {
5088                 ring_buffer_put(old_rb);
5089                 /*
5090                  * Since we detached from the old rb before attaching the
5091                  * new one, we could have missed a wakeup.
5092                  * Provide it now.
5093                  */
5094                 wake_up_all(&event->waitq);
5095         }
5096 }
5097
5098 static void ring_buffer_wakeup(struct perf_event *event)
5099 {
5100         struct ring_buffer *rb;
5101
5102         rcu_read_lock();
5103         rb = rcu_dereference(event->rb);
5104         if (rb) {
5105                 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5106                         wake_up_all(&event->waitq);
5107         }
5108         rcu_read_unlock();
5109 }
5110
5111 struct ring_buffer *ring_buffer_get(struct perf_event *event)
5112 {
5113         struct ring_buffer *rb;
5114
5115         rcu_read_lock();
5116         rb = rcu_dereference(event->rb);
5117         if (rb) {
5118                 if (!atomic_inc_not_zero(&rb->refcount))
5119                         rb = NULL;
5120         }
5121         rcu_read_unlock();
5122
5123         return rb;
5124 }
5125
5126 void ring_buffer_put(struct ring_buffer *rb)
5127 {
5128         if (!atomic_dec_and_test(&rb->refcount))
5129                 return;
5130
5131         WARN_ON_ONCE(!list_empty(&rb->event_list));
5132
5133         call_rcu(&rb->rcu_head, rb_free_rcu);
5134 }
5135
5136 static void perf_mmap_open(struct vm_area_struct *vma)
5137 {
5138         struct perf_event *event = vma->vm_file->private_data;
5139
5140         atomic_inc(&event->mmap_count);
5141         atomic_inc(&event->rb->mmap_count);
5142
5143         if (vma->vm_pgoff)
5144                 atomic_inc(&event->rb->aux_mmap_count);
5145
5146         if (event->pmu->event_mapped)
5147                 event->pmu->event_mapped(event, vma->vm_mm);
5148 }
5149
5150 static void perf_pmu_output_stop(struct perf_event *event);
5151
5152 /*
5153  * A buffer can be mmap()ed multiple times; either directly through the same
5154  * event, or through other events by use of perf_event_set_output().
5155  *
5156  * In order to undo the VM accounting done by perf_mmap() we need to destroy
5157  * the buffer here, where we still have a VM context. This means we need
5158  * to detach all events redirecting to us.
5159  */
5160 static void perf_mmap_close(struct vm_area_struct *vma)
5161 {
5162         struct perf_event *event = vma->vm_file->private_data;
5163
5164         struct ring_buffer *rb = ring_buffer_get(event);
5165         struct user_struct *mmap_user = rb->mmap_user;
5166         int mmap_locked = rb->mmap_locked;
5167         unsigned long size = perf_data_size(rb);
5168
5169         if (event->pmu->event_unmapped)
5170                 event->pmu->event_unmapped(event, vma->vm_mm);
5171
5172         /*
5173          * rb->aux_mmap_count will always drop before rb->mmap_count and
5174          * event->mmap_count, so it is ok to use event->mmap_mutex to
5175          * serialize with perf_mmap here.
5176          */
5177         if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5178             atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
5179                 /*
5180                  * Stop all AUX events that are writing to this buffer,
5181                  * so that we can free its AUX pages and corresponding PMU
5182                  * data. Note that after rb::aux_mmap_count dropped to zero,
5183                  * they won't start any more (see perf_aux_output_begin()).
5184                  */
5185                 perf_pmu_output_stop(event);
5186
5187                 /* now it's safe to free the pages */
5188                 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5189                 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
5190
5191                 /* this has to be the last one */
5192                 rb_free_aux(rb);
5193                 WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
5194
5195                 mutex_unlock(&event->mmap_mutex);
5196         }
5197
5198         atomic_dec(&rb->mmap_count);
5199
5200         if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5201                 goto out_put;
5202
5203         ring_buffer_attach(event, NULL);
5204         mutex_unlock(&event->mmap_mutex);
5205
5206         /* If there's still other mmap()s of this buffer, we're done. */
5207         if (atomic_read(&rb->mmap_count))
5208                 goto out_put;
5209
5210         /*
5211          * No other mmap()s, detach from all other events that might redirect
5212          * into the now unreachable buffer. Somewhat complicated by the
5213          * fact that rb::event_lock otherwise nests inside mmap_mutex.
5214          */
5215 again:
5216         rcu_read_lock();
5217         list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5218                 if (!atomic_long_inc_not_zero(&event->refcount)) {
5219                         /*
5220                          * This event is en-route to free_event() which will
5221                          * detach it and remove it from the list.
5222                          */
5223                         continue;
5224                 }
5225                 rcu_read_unlock();
5226
5227                 mutex_lock(&event->mmap_mutex);
5228                 /*
5229                  * Check we didn't race with perf_event_set_output() which can
5230                  * swizzle the rb from under us while we were waiting to
5231                  * acquire mmap_mutex.
5232                  *
5233                  * If we find a different rb, ignore this event; the next
5234                  * iteration will no longer find it on the list. We still
5235                  * have to restart the iteration to make sure we're not now
5236                  * iterating the wrong list.
5237                  */
5238                 if (event->rb == rb)
5239                         ring_buffer_attach(event, NULL);
5240
5241                 mutex_unlock(&event->mmap_mutex);
5242                 put_event(event);
5243
5244                 /*
5245                  * Restart the iteration; either we're on the wrong list or
5246                  * we destroyed its integrity by doing a deletion.
5247                  */
5248                 goto again;
5249         }
5250         rcu_read_unlock();
5251
5252         /*
5253          * It could be there's still a few 0-ref events on the list; they'll
5254          * get cleaned up by free_event() -- they'll also still have their
5255          * ref on the rb and will free it whenever they are done with it.
5256          *
5257          * Aside from that, this buffer is 'fully' detached and unmapped,
5258          * undo the VM accounting.
5259          */
5260
5261         atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
5262         vma->vm_mm->pinned_vm -= mmap_locked;
5263         free_uid(mmap_user);
5264
5265 out_put:
5266         ring_buffer_put(rb); /* could be last */
5267 }
5268
5269 static const struct vm_operations_struct perf_mmap_vmops = {
5270         .open           = perf_mmap_open,
5271         .close          = perf_mmap_close, /* non mergable */
5272         .fault          = perf_mmap_fault,
5273         .page_mkwrite   = perf_mmap_fault,
5274 };
5275
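/*
 * Illustrative sketch (user-space side, not part of this file): the data
 * area is mapped as one metadata page plus a power-of-two number of data
 * pages, e.g. for 8 data pages:
 *
 *	size_t len = (1 + 8) * sysconf(_SC_PAGESIZE);
 *	void *base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */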
5276 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
5277 {
5278         struct perf_event *event = file->private_data;
5279         unsigned long user_locked, user_lock_limit;
5280         struct user_struct *user = current_user();
5281         unsigned long locked, lock_limit;
5282         struct ring_buffer *rb = NULL;
5283         unsigned long vma_size;
5284         unsigned long nr_pages;
5285         long user_extra = 0, extra = 0;
5286         int ret = 0, flags = 0;
5287
5288         /*
5289          * Don't allow mmap() of inherited per-task counters. This would
5290          * create a performance issue due to all children writing to the
5291          * same rb.
5292          */
5293         if (event->cpu == -1 && event->attr.inherit)
5294                 return -EINVAL;
5295
5296         if (!(vma->vm_flags & VM_SHARED))
5297                 return -EINVAL;
5298
5299         vma_size = vma->vm_end - vma->vm_start;
5300
5301         if (vma->vm_pgoff == 0) {
5302                 nr_pages = (vma_size / PAGE_SIZE) - 1;
5303         } else {
5304                 /*
5305                  * AUX area mapping: if rb->aux_nr_pages != 0, it's already
5306                  * mapped; all subsequent mappings must have the same size
5307                  * and offset, and must lie above the normal perf buffer.
5308                  */
5309                 u64 aux_offset, aux_size;
5310
5311                 if (!event->rb)
5312                         return -EINVAL;
5313
5314                 nr_pages = vma_size / PAGE_SIZE;
5315
5316                 mutex_lock(&event->mmap_mutex);
5317                 ret = -EINVAL;
5318
5319                 rb = event->rb;
5320                 if (!rb)
5321                         goto aux_unlock;
5322
5323                 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
5324                 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
5325
5326                 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
5327                         goto aux_unlock;
5328
5329                 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
5330                         goto aux_unlock;
5331
5332                 /* already mapped with a different offset */
5333                 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
5334                         goto aux_unlock;
5335
5336                 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
5337                         goto aux_unlock;
5338
5339                 /* already mapped with a different size */
5340                 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
5341                         goto aux_unlock;
5342
5343                 if (!is_power_of_2(nr_pages))
5344                         goto aux_unlock;
5345
5346                 if (!atomic_inc_not_zero(&rb->mmap_count))
5347                         goto aux_unlock;
5348
5349                 if (rb_has_aux(rb)) {
5350                         atomic_inc(&rb->aux_mmap_count);
5351                         ret = 0;
5352                         goto unlock;
5353                 }
5354
5355                 atomic_set(&rb->aux_mmap_count, 1);
5356                 user_extra = nr_pages;
5357
5358                 goto accounting;
5359         }
5360
5361         /*
5362          * If we have rb pages ensure they're a power-of-two number, so we
5363          * can do bitmasks instead of modulo.
5364          */
5365         if (nr_pages != 0 && !is_power_of_2(nr_pages))
5366                 return -EINVAL;
5367
5368         if (vma_size != PAGE_SIZE * (1 + nr_pages))
5369                 return -EINVAL;
5370
5371         WARN_ON_ONCE(event->ctx->parent_ctx);
5372 again:
5373         mutex_lock(&event->mmap_mutex);
5374         if (event->rb) {
5375                 if (event->rb->nr_pages != nr_pages) {
5376                         ret = -EINVAL;
5377                         goto unlock;
5378                 }
5379
5380                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
5381                         /*
5382                          * Raced against perf_mmap_close() through
5383                          * perf_event_set_output(). Try again, hope for better
5384                          * luck.
5385                          */
5386                         mutex_unlock(&event->mmap_mutex);
5387                         goto again;
5388                 }
5389
5390                 goto unlock;
5391         }
5392
5393         user_extra = nr_pages + 1;
5394
5395 accounting:
5396         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
5397
5398         /*
5399          * Increase the limit linearly with more CPUs:
5400          */
5401         user_lock_limit *= num_online_cpus();
5402
5403         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
5404
5405         if (user_locked > user_lock_limit)
5406                 extra = user_locked - user_lock_limit;
5407
5408         lock_limit = rlimit(RLIMIT_MEMLOCK);
5409         lock_limit >>= PAGE_SHIFT;
5410         locked = vma->vm_mm->pinned_vm + extra;
5411
5412         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
5413                 !capable(CAP_IPC_LOCK)) {
5414                 ret = -EPERM;
5415                 goto unlock;
5416         }
5417
5418         WARN_ON(!rb && event->rb);
5419
5420         if (vma->vm_flags & VM_WRITE)
5421                 flags |= RING_BUFFER_WRITABLE;
5422
5423         if (!rb) {
5424                 rb = rb_alloc(nr_pages,
5425                               event->attr.watermark ? event->attr.wakeup_watermark : 0,
5426                               event->cpu, flags);
5427
5428                 if (!rb) {
5429                         ret = -ENOMEM;
5430                         goto unlock;
5431                 }
5432
5433                 atomic_set(&rb->mmap_count, 1);
5434                 rb->mmap_user = get_current_user();
5435                 rb->mmap_locked = extra;
5436
5437                 ring_buffer_attach(event, rb);
5438
5439                 perf_event_init_userpage(event);
5440                 perf_event_update_userpage(event);
5441         } else {
5442                 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
5443                                    event->attr.aux_watermark, flags);
5444                 if (!ret)
5445                         rb->aux_mmap_locked = extra;
5446         }
5447
5448 unlock:
5449         if (!ret) {
5450                 atomic_long_add(user_extra, &user->locked_vm);
5451                 vma->vm_mm->pinned_vm += extra;
5452
5453                 atomic_inc(&event->mmap_count);
5454         } else if (rb) {
5455                 atomic_dec(&rb->mmap_count);
5456         }
5457 aux_unlock:
5458         mutex_unlock(&event->mmap_mutex);
5459
5460         /*
5461          * Since pinned accounting is per vm we cannot allow fork() to copy our
5462          * vma.
5463          */
5464         vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
5465         vma->vm_ops = &perf_mmap_vmops;
5466
5467         if (event->pmu->event_mapped)
5468                 event->pmu->event_mapped(event, vma->vm_mm);
5469
5470         return ret;
5471 }
5472
5473 static int perf_fasync(int fd, struct file *filp, int on)
5474 {
5475         struct inode *inode = file_inode(filp);
5476         struct perf_event *event = filp->private_data;
5477         int retval;
5478
5479         inode_lock(inode);
5480         retval = fasync_helper(fd, filp, on, &event->fasync);
5481         inode_unlock(inode);
5482
5483         if (retval < 0)
5484                 return retval;
5485
5486         return 0;
5487 }
5488
5489 static const struct file_operations perf_fops = {
5490         .llseek                 = no_llseek,
5491         .release                = perf_release,
5492         .read                   = perf_read,
5493         .poll                   = perf_poll,
5494         .unlocked_ioctl         = perf_ioctl,
5495         .compat_ioctl           = perf_compat_ioctl,
5496         .mmap                   = perf_mmap,
5497         .fasync                 = perf_fasync,
5498 };
5499
5500 /*
5501  * Perf event wakeup
5502  *
5503  * If there's data, ensure we set the poll() state and publish everything
5504  * to user-space before waking everybody up.
5505  */
5506
5507 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
5508 {
5509         /* only the parent has fasync state */
5510         if (event->parent)
5511                 event = event->parent;
5512         return &event->fasync;
5513 }
5514
5515 void perf_event_wakeup(struct perf_event *event)
5516 {
5517         ring_buffer_wakeup(event);
5518
5519         if (event->pending_kill) {
5520                 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
5521                 event->pending_kill = 0;
5522         }
5523 }
5524
5525 static void perf_pending_event(struct irq_work *entry)
5526 {
5527         struct perf_event *event = container_of(entry,
5528                         struct perf_event, pending);
5529         int rctx;
5530
5531         rctx = perf_swevent_get_recursion_context();
5532         /*
5533          * If we 'fail' here, that's OK, it means recursion is already disabled
5534          * and we won't recurse 'further'.
5535          */
5536
5537         if (event->pending_disable) {
5538                 event->pending_disable = 0;
5539                 perf_event_disable_local(event);
5540         }
5541
5542         if (event->pending_wakeup) {
5543                 event->pending_wakeup = 0;
5544                 perf_event_wakeup(event);
5545         }
5546
5547         if (rctx >= 0)
5548                 perf_swevent_put_recursion_context(rctx);
5549 }
5550
5551 /*
5552  * We assume there is only KVM supporting the callbacks.
5553  * Later on, we might change it to a list if there is
5554  * another virtualization implementation supporting the callbacks.
5555  */
5556 struct perf_guest_info_callbacks *perf_guest_cbs;
5557
5558 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5559 {
5560         perf_guest_cbs = cbs;
5561         return 0;
5562 }
5563 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5564
5565 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5566 {
5567         perf_guest_cbs = NULL;
5568         return 0;
5569 }
5570 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5571
5572 static void
5573 perf_output_sample_regs(struct perf_output_handle *handle,
5574                         struct pt_regs *regs, u64 mask)
5575 {
5576         int bit;
5577         DECLARE_BITMAP(_mask, 64);
5578
5579         bitmap_from_u64(_mask, mask);
5580         for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
5581                 u64 val;
5582
5583                 val = perf_reg_value(regs, bit);
5584                 perf_output_put(handle, val);
5585         }
5586 }
5587
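/*
 * Pick the user-level registers to sample: the interrupted registers if
 * the event fired in user mode, otherwise the task's saved user registers;
 * kernel threads have none.
 */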
5588 static void perf_sample_regs_user(struct perf_regs *regs_user,
5589                                   struct pt_regs *regs,
5590                                   struct pt_regs *regs_user_copy)
5591 {
5592         if (user_mode(regs)) {
5593                 regs_user->abi = perf_reg_abi(current);
5594                 regs_user->regs = regs;
5595         } else if (current->mm) {
5596                 perf_get_regs_user(regs_user, regs, regs_user_copy);
5597         } else {
5598                 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5599                 regs_user->regs = NULL;
5600         }
5601 }
5602
5603 static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5604                                   struct pt_regs *regs)
5605 {
5606         regs_intr->regs = regs;
5607         regs_intr->abi  = perf_reg_abi(current);
5608 }
5609
5610
5611 /*
5612  * Get remaining task size from user stack pointer.
5613  *
5614  * It'd be better to take the stack vma and limit this more
5615  * precisely, but there's no way to get it safely under interrupt,
5616  * so use TASK_SIZE as the limit.
5617  */
5618 static u64 perf_ustack_task_size(struct pt_regs *regs)
5619 {
5620         unsigned long addr = perf_user_stack_pointer(regs);
5621
5622         if (!addr || addr >= TASK_SIZE)
5623                 return 0;
5624
5625         return TASK_SIZE - addr;
5626 }
5627
5628 static u16
5629 perf_sample_ustack_size(u16 stack_size, u16 header_size,
5630                         struct pt_regs *regs)
5631 {
5632         u64 task_size;
5633
5634         /* No regs, no stack pointer, no dump. */
5635         if (!regs)
5636                 return 0;
5637
5638         /*
5639          * Check that the requested stack size fits within:
5640          * - TASK_SIZE
5641          *   If it doesn't, limit the size to TASK_SIZE.
5642          *
5643          * - the remaining sample size
5644          *   If it doesn't, shrink the stack size to fit
5645          *   into the remaining sample size.
5646          */
5647
5648         task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
5649         stack_size = min(stack_size, (u16) task_size);
5650
5651         /* Current header size plus static size and dynamic size. */
5652         header_size += 2 * sizeof(u64);
5653
5654         /* Do we fit in with the current stack dump size? */
5655         if ((u16) (header_size + stack_size) < header_size) {
5656                 /*
5657                  * If we overflow the maximum size for the sample,
5658                  * shrink the stack dump size so that it fits.
5659                  */
5660                 stack_size = USHRT_MAX - header_size - sizeof(u64);
5661                 stack_size = round_up(stack_size, sizeof(u64));
5662         }
5663
5664         return stack_size;
5665 }
5666
5667 static void
5668 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
5669                           struct pt_regs *regs)
5670 {
5671         /* Case of a kernel thread, nothing to dump */
5672         if (!regs) {
5673                 u64 size = 0;
5674                 perf_output_put(handle, size);
5675         } else {
5676                 unsigned long sp;
5677                 unsigned int rem;
5678                 u64 dyn_size;
5679
5680                 /*
5681                  * We dump:
5682                  * static size
5683                  *   - the size requested by user or the best one we can fit
5684                  *     in to the sample max size
5685                  * data
5686                  *   - user stack dump data
5687                  * dynamic size
5688                  *   - the actual dumped size
5689                  */
5690
5691                 /* Static size. */
5692                 perf_output_put(handle, dump_size);
5693
5694                 /* Data. */
5695                 sp = perf_user_stack_pointer(regs);
5696                 rem = __output_copy_user(handle, (void *) sp, dump_size);
5697                 dyn_size = dump_size - rem;
5698
5699                 perf_output_skip(handle, rem);
5700
5701                 /* Dynamic size. */
5702                 perf_output_put(handle, dyn_size);
5703         }
5704 }
5705
5706 static void __perf_event_header__init_id(struct perf_event_header *header,
5707                                          struct perf_sample_data *data,
5708                                          struct perf_event *event)
5709 {
5710         u64 sample_type = event->attr.sample_type;
5711
5712         data->type = sample_type;
5713         header->size += event->id_header_size;
5714
5715         if (sample_type & PERF_SAMPLE_TID) {
5716                 /* namespace issues */
5717                 data->tid_entry.pid = perf_event_pid(event, current);
5718                 data->tid_entry.tid = perf_event_tid(event, current);
5719         }
5720
5721         if (sample_type & PERF_SAMPLE_TIME)
5722                 data->time = perf_event_clock(event);
5723
5724         if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
5725                 data->id = primary_event_id(event);
5726
5727         if (sample_type & PERF_SAMPLE_STREAM_ID)
5728                 data->stream_id = event->id;
5729
5730         if (sample_type & PERF_SAMPLE_CPU) {
5731                 data->cpu_entry.cpu      = raw_smp_processor_id();
5732                 data->cpu_entry.reserved = 0;
5733         }
5734 }
5735
5736 void perf_event_header__init_id(struct perf_event_header *header,
5737                                 struct perf_sample_data *data,
5738                                 struct perf_event *event)
5739 {
5740         if (event->attr.sample_id_all)
5741                 __perf_event_header__init_id(header, data, event);
5742 }
5743
5744 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
5745                                            struct perf_sample_data *data)
5746 {
5747         u64 sample_type = data->type;
5748
5749         if (sample_type & PERF_SAMPLE_TID)
5750                 perf_output_put(handle, data->tid_entry);
5751
5752         if (sample_type & PERF_SAMPLE_TIME)
5753                 perf_output_put(handle, data->time);
5754
5755         if (sample_type & PERF_SAMPLE_ID)
5756                 perf_output_put(handle, data->id);
5757
5758         if (sample_type & PERF_SAMPLE_STREAM_ID)
5759                 perf_output_put(handle, data->stream_id);
5760
5761         if (sample_type & PERF_SAMPLE_CPU)
5762                 perf_output_put(handle, data->cpu_entry);
5763
5764         if (sample_type & PERF_SAMPLE_IDENTIFIER)
5765                 perf_output_put(handle, data->id);
5766 }
5767
5768 void perf_event__output_id_sample(struct perf_event *event,
5769                                   struct perf_output_handle *handle,
5770                                   struct perf_sample_data *sample)
5771 {
5772         if (event->attr.sample_id_all)
5773                 __perf_event__output_id_sample(handle, sample);
5774 }
5775
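/*
 * Emit the read_format data for a single event: the count, optionally the
 * enabled/running times and optionally the event ID.
 */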
5776 static void perf_output_read_one(struct perf_output_handle *handle,
5777                                  struct perf_event *event,
5778                                  u64 enabled, u64 running)
5779 {
5780         u64 read_format = event->attr.read_format;
5781         u64 values[4];
5782         int n = 0;
5783
5784         values[n++] = perf_event_count(event);
5785         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5786                 values[n++] = enabled +
5787                         atomic64_read(&event->child_total_time_enabled);
5788         }
5789         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5790                 values[n++] = running +
5791                         atomic64_read(&event->child_total_time_running);
5792         }
5793         if (read_format & PERF_FORMAT_ID)
5794                 values[n++] = primary_event_id(event);
5795
5796         __output_copy(handle, values, n * sizeof(u64));
5797 }
5798
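/*
 * Emit the read_format data for an event group: the number of members,
 * optional enabled/running times, then a value (and optional ID) for the
 * leader and each sibling.
 */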
5799 static void perf_output_read_group(struct perf_output_handle *handle,
5800                             struct perf_event *event,
5801                             u64 enabled, u64 running)
5802 {
5803         struct perf_event *leader = event->group_leader, *sub;
5804         u64 read_format = event->attr.read_format;
5805         u64 values[5];
5806         int n = 0;
5807
5808         values[n++] = 1 + leader->nr_siblings;
5809
5810         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5811                 values[n++] = enabled;
5812
5813         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5814                 values[n++] = running;
5815
5816         if (leader != event)
5817                 leader->pmu->read(leader);
5818
5819         values[n++] = perf_event_count(leader);
5820         if (read_format & PERF_FORMAT_ID)
5821                 values[n++] = primary_event_id(leader);
5822
5823         __output_copy(handle, values, n * sizeof(u64));
5824
5825         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
5826                 n = 0;
5827
5828                 if ((sub != event) &&
5829                     (sub->state == PERF_EVENT_STATE_ACTIVE))
5830                         sub->pmu->read(sub);
5831
5832                 values[n++] = perf_event_count(sub);
5833                 if (read_format & PERF_FORMAT_ID)
5834                         values[n++] = primary_event_id(sub);
5835
5836                 __output_copy(handle, values, n * sizeof(u64));
5837         }
5838 }
5839
5840 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5841                                  PERF_FORMAT_TOTAL_TIME_RUNNING)
5842
5843 /*
5844  * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
5845  *
5846  * The problem is that it's both hard and excessively expensive to iterate the
5847  * child list, not to mention that it's impossible to IPI the children running
5848  * on another CPU from interrupt/NMI context.
5849  */
5850 static void perf_output_read(struct perf_output_handle *handle,
5851                              struct perf_event *event)
5852 {
5853         u64 enabled = 0, running = 0, now;
5854         u64 read_format = event->attr.read_format;
5855
5856         /*
5857          * compute total_time_enabled, total_time_running
5858          * based on snapshot values taken when the event
5859          * was last scheduled in.
5860          *
5861          * we cannot simply call update_context_time()
5862          * because of locking issues, as we are called in
5863          * NMI context
5864          */
5865         if (read_format & PERF_FORMAT_TOTAL_TIMES)
5866                 calc_timer_values(event, &now, &enabled, &running);
5867
5868         if (event->attr.read_format & PERF_FORMAT_GROUP)
5869                 perf_output_read_group(handle, event, enabled, running);
5870         else
5871                 perf_output_read_one(handle, event, enabled, running);
5872 }
5873
5874 void perf_output_sample(struct perf_output_handle *handle,
5875                         struct perf_event_header *header,
5876                         struct perf_sample_data *data,
5877                         struct perf_event *event)
5878 {
5879         u64 sample_type = data->type;
5880
5881         perf_output_put(handle, *header);
5882
5883         if (sample_type & PERF_SAMPLE_IDENTIFIER)
5884                 perf_output_put(handle, data->id);
5885
5886         if (sample_type & PERF_SAMPLE_IP)
5887                 perf_output_put(handle, data->ip);
5888
5889         if (sample_type & PERF_SAMPLE_TID)
5890                 perf_output_put(handle, data->tid_entry);
5891
5892         if (sample_type & PERF_SAMPLE_TIME)
5893                 perf_output_put(handle, data->time);
5894
5895         if (sample_type & PERF_SAMPLE_ADDR)
5896                 perf_output_put(handle, data->addr);
5897
5898         if (sample_type & PERF_SAMPLE_ID)
5899                 perf_output_put(handle, data->id);
5900
5901         if (sample_type & PERF_SAMPLE_STREAM_ID)
5902                 perf_output_put(handle, data->stream_id);
5903
5904         if (sample_type & PERF_SAMPLE_CPU)
5905                 perf_output_put(handle, data->cpu_entry);
5906
5907         if (sample_type & PERF_SAMPLE_PERIOD)
5908                 perf_output_put(handle, data->period);
5909
5910         if (sample_type & PERF_SAMPLE_READ)
5911                 perf_output_read(handle, event);
5912
5913         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5914                 if (data->callchain) {
5915                         int size = 1;
5916
5917                         if (data->callchain)
5918                                 size += data->callchain->nr;
5919
5920                         size *= sizeof(u64);
5921
5922                         __output_copy(handle, data->callchain, size);
5923                 } else {
5924                         u64 nr = 0;
5925                         perf_output_put(handle, nr);
5926                 }
5927         }
5928
5929         if (sample_type & PERF_SAMPLE_RAW) {
5930                 struct perf_raw_record *raw = data->raw;
5931
5932                 if (raw) {
5933                         struct perf_raw_frag *frag = &raw->frag;
5934
5935                         perf_output_put(handle, raw->size);
5936                         do {
5937                                 if (frag->copy) {
5938                                         __output_custom(handle, frag->copy,
5939                                                         frag->data, frag->size);
5940                                 } else {
5941                                         __output_copy(handle, frag->data,
5942                                                       frag->size);
5943                                 }
5944                                 if (perf_raw_frag_last(frag))
5945                                         break;
5946                                 frag = frag->next;
5947                         } while (1);
5948                         if (frag->pad)
5949                                 __output_skip(handle, NULL, frag->pad);
5950                 } else {
5951                         struct {
5952                                 u32     size;
5953                                 u32     data;
5954                         } raw = {
5955                                 .size = sizeof(u32),
5956                                 .data = 0,
5957                         };
5958                         perf_output_put(handle, raw);
5959                 }
5960         }
5961
5962         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5963                 if (data->br_stack) {
5964                         size_t size;
5965
5966                         size = data->br_stack->nr
5967                              * sizeof(struct perf_branch_entry);
5968
5969                         perf_output_put(handle, data->br_stack->nr);
5970                         perf_output_copy(handle, data->br_stack->entries, size);
5971                 } else {
5972                         /*
5973                          * we always store at least the value of nr
5974                          */
5975                         u64 nr = 0;
5976                         perf_output_put(handle, nr);
5977                 }
5978         }
5979
5980         if (sample_type & PERF_SAMPLE_REGS_USER) {
5981                 u64 abi = data->regs_user.abi;
5982
5983                 /*
5984                  * If there are no regs to dump, notice it through
5985                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5986                  */
5987                 perf_output_put(handle, abi);
5988
5989                 if (abi) {
5990                         u64 mask = event->attr.sample_regs_user;
5991                         perf_output_sample_regs(handle,
5992                                                 data->regs_user.regs,
5993                                                 mask);
5994                 }
5995         }
5996
5997         if (sample_type & PERF_SAMPLE_STACK_USER) {
5998                 perf_output_sample_ustack(handle,
5999                                           data->stack_user_size,
6000                                           data->regs_user.regs);
6001         }
6002
6003         if (sample_type & PERF_SAMPLE_WEIGHT)
6004                 perf_output_put(handle, data->weight);
6005
6006         if (sample_type & PERF_SAMPLE_DATA_SRC)
6007                 perf_output_put(handle, data->data_src.val);
6008
6009         if (sample_type & PERF_SAMPLE_TRANSACTION)
6010                 perf_output_put(handle, data->txn);
6011
6012         if (sample_type & PERF_SAMPLE_REGS_INTR) {
6013                 u64 abi = data->regs_intr.abi;
6014                 /*
6015                  * If there are no regs to dump, notice it through
6016                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
6017                  */
6018                 perf_output_put(handle, abi);
6019
6020                 if (abi) {
6021                         u64 mask = event->attr.sample_regs_intr;
6022
6023                         perf_output_sample_regs(handle,
6024                                                 data->regs_intr.regs,
6025                                                 mask);
6026                 }
6027         }
6028
6029         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6030                 perf_output_put(handle, data->phys_addr);
6031
6032         if (!event->attr.watermark) {
6033                 int wakeup_events = event->attr.wakeup_events;
6034
6035                 if (wakeup_events) {
6036                         struct ring_buffer *rb = handle->rb;
6037                         int events = local_inc_return(&rb->events);
6038
6039                         if (events >= wakeup_events) {
6040                                 local_sub(wakeup_events, &rb->events);
6041                                 local_inc(&rb->wakeup);
6042                         }
6043                 }
6044         }
6045 }
6046
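/*
 * Best-effort virtual-to-physical translation for PERF_SAMPLE_PHYS_ADDR;
 * returns 0 whenever the translation cannot be done safely.
 */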
6047 static u64 perf_virt_to_phys(u64 virt)
6048 {
6049         u64 phys_addr = 0;
6050         struct page *p = NULL;
6051
6052         if (!virt)
6053                 return 0;
6054
6055         if (virt >= TASK_SIZE) {
6056                 /* If it's vmalloc()d memory, leave phys_addr as 0 */
6057                 if (virt_addr_valid((void *)(uintptr_t)virt) &&
6058                     !(virt >= VMALLOC_START && virt < VMALLOC_END))
6059                         phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
6060         } else {
6061                 /*
6062                  * Walk the page tables for a user address.
6063                  * Interrupts are disabled, which prevents any teardown
6064                  * of the page tables.
6065                  * Try the IRQ-safe __get_user_pages_fast() first.
6066                  * If that fails, leave phys_addr as 0.
6067                  */
6068                 if ((current->mm != NULL) &&
6069                     (__get_user_pages_fast(virt, 1, 0, &p) == 1))
6070                         phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
6071
6072                 if (p)
6073                         put_page(p);
6074         }
6075
6076         return phys_addr;
6077 }
6078
6079 void perf_prepare_sample(struct perf_event_header *header,
6080                          struct perf_sample_data *data,
6081                          struct perf_event *event,
6082                          struct pt_regs *regs)
6083 {
6084         u64 sample_type = event->attr.sample_type;
6085
6086         header->type = PERF_RECORD_SAMPLE;
6087         header->size = sizeof(*header) + event->header_size;
6088
6089         header->misc = 0;
6090         header->misc |= perf_misc_flags(regs);
6091
6092         __perf_event_header__init_id(header, data, event);
6093
6094         if (sample_type & PERF_SAMPLE_IP)
6095                 data->ip = perf_instruction_pointer(regs);
6096
6097         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6098                 int size = 1;
6099
6100                 data->callchain = perf_callchain(event, regs);
6101
6102                 if (data->callchain)
6103                         size += data->callchain->nr;
6104
6105                 header->size += size * sizeof(u64);
6106         }
6107
6108         if (sample_type & PERF_SAMPLE_RAW) {
6109                 struct perf_raw_record *raw = data->raw;
6110                 int size;
6111
6112                 if (raw) {
6113                         struct perf_raw_frag *frag = &raw->frag;
6114                         u32 sum = 0;
6115
6116                         do {
6117                                 sum += frag->size;
6118                                 if (perf_raw_frag_last(frag))
6119                                         break;
6120                                 frag = frag->next;
6121                         } while (1);
6122
6123                         size = round_up(sum + sizeof(u32), sizeof(u64));
6124                         raw->size = size - sizeof(u32);
6125                         frag->pad = raw->size - sum;
6126                 } else {
6127                         size = sizeof(u64);
6128                 }
6129
6130                 header->size += size;
6131         }
6132
6133         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6134                 int size = sizeof(u64); /* nr */
6135                 if (data->br_stack) {
6136                         size += data->br_stack->nr
6137                               * sizeof(struct perf_branch_entry);
6138                 }
6139                 header->size += size;
6140         }
6141
6142         if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
6143                 perf_sample_regs_user(&data->regs_user, regs,
6144                                       &data->regs_user_copy);
6145
6146         if (sample_type & PERF_SAMPLE_REGS_USER) {
6147                 /* regs dump ABI info */
6148                 int size = sizeof(u64);
6149
6150                 if (data->regs_user.regs) {
6151                         u64 mask = event->attr.sample_regs_user;
6152                         size += hweight64(mask) * sizeof(u64);
6153                 }
6154
6155                 header->size += size;
6156         }
6157
6158         if (sample_type & PERF_SAMPLE_STACK_USER) {
6159                 /*
6160                  * Either the PERF_SAMPLE_STACK_USER bit needs to always be
6161                  * processed last, or an additional check needs to be added
6162                  * whenever a new sample type is introduced, because we could
6163                  * eat up the rest of the sample size.
6164                  */
6165                 u16 stack_size = event->attr.sample_stack_user;
6166                 u16 size = sizeof(u64);
6167
6168                 stack_size = perf_sample_ustack_size(stack_size, header->size,
6169                                                      data->regs_user.regs);
6170
6171                 /*
6172                  * If there is something to dump, add space for the dump
6173                  * itself and for the field that tells the dynamic size,
6174                  * which is how many have been actually dumped.
6175                  */
6176                 if (stack_size)
6177                         size += sizeof(u64) + stack_size;
6178
6179                 data->stack_user_size = stack_size;
6180                 header->size += size;
6181         }
6182
6183         if (sample_type & PERF_SAMPLE_REGS_INTR) {
6184                 /* regs dump ABI info */
6185                 int size = sizeof(u64);
6186
6187                 perf_sample_regs_intr(&data->regs_intr, regs);
6188
6189                 if (data->regs_intr.regs) {
6190                         u64 mask = event->attr.sample_regs_intr;
6191
6192                         size += hweight64(mask) * sizeof(u64);
6193                 }
6194
6195                 header->size += size;
6196         }
6197
6198         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6199                 data->phys_addr = perf_virt_to_phys(data->addr);
6200 }
6201
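/*
 * Common sample output path: size up the sample, reserve space in the
 * ring buffer via @output_begin, write the sample and commit it.
 */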
6202 static void __always_inline
6203 __perf_event_output(struct perf_event *event,
6204                     struct perf_sample_data *data,
6205                     struct pt_regs *regs,
6206                     int (*output_begin)(struct perf_output_handle *,
6207                                         struct perf_event *,
6208                                         unsigned int))
6209 {
6210         struct perf_output_handle handle;
6211         struct perf_event_header header;
6212
6213         /* protect the callchain buffers */
6214         rcu_read_lock();
6215
6216         perf_prepare_sample(&header, data, event, regs);
6217
6218         if (output_begin(&handle, event, header.size))
6219                 goto exit;
6220
6221         perf_output_sample(&handle, &header, data, event);
6222
6223         perf_output_end(&handle);
6224
6225 exit:
6226         rcu_read_unlock();
6227 }
6228
6229 void
6230 perf_event_output_forward(struct perf_event *event,
6231                          struct perf_sample_data *data,
6232                          struct pt_regs *regs)
6233 {
6234         __perf_event_output(event, data, regs, perf_output_begin_forward);
6235 }
6236
6237 void
6238 perf_event_output_backward(struct perf_event *event,
6239                            struct perf_sample_data *data,
6240                            struct pt_regs *regs)
6241 {
6242         __perf_event_output(event, data, regs, perf_output_begin_backward);
6243 }
6244
6245 void
6246 perf_event_output(struct perf_event *event,
6247                   struct perf_sample_data *data,
6248                   struct pt_regs *regs)
6249 {
6250         __perf_event_output(event, data, regs, perf_output_begin);
6251 }
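
/*
 * Note (added for clarity): when no custom overflow handler is installed,
 * perf_event_alloc() is expected to wire event->overflow_handler to
 * perf_event_output_backward() for write_backward events and to
 * perf_event_output_forward() otherwise; perf_event_output() goes through
 * perf_output_begin(), which picks the direction at run time.
 */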
6252
6253 /*
6254  * read event_id
6255  */
6256
6257 struct perf_read_event {
6258         struct perf_event_header        header;
6259
6260         u32                             pid;
6261         u32                             tid;
6262 };
6263
6264 static void
6265 perf_event_read_event(struct perf_event *event,
6266                         struct task_struct *task)
6267 {
6268         struct perf_output_handle handle;
6269         struct perf_sample_data sample;
6270         struct perf_read_event read_event = {
6271                 .header = {
6272                         .type = PERF_RECORD_READ,
6273                         .misc = 0,
6274                         .size = sizeof(read_event) + event->read_size,
6275                 },
6276                 .pid = perf_event_pid(event, task),
6277                 .tid = perf_event_tid(event, task),
6278         };
6279         int ret;
6280
6281         perf_event_header__init_id(&read_event.header, &sample, event);
6282         ret = perf_output_begin(&handle, event, read_event.header.size);
6283         if (ret)
6284                 return;
6285
6286         perf_output_put(&handle, read_event);
6287         perf_output_read(&handle, event);
6288         perf_event__output_id_sample(event, &handle, &sample);
6289
6290         perf_output_end(&handle);
6291 }
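
/*
 * Note (added for clarity): the resulting PERF_RECORD_READ consists of the
 * header and pid/tid above, followed by the counter values emitted by
 * perf_output_read() in the event's read_format layout, hence the header
 * size of sizeof(read_event) + event->read_size.
 */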
6292
6293 typedef void (perf_iterate_f)(struct perf_event *event, void *data);
6294
6295 static void
6296 perf_iterate_ctx(struct perf_event_context *ctx,
6297                    perf_iterate_f output,
6298                    void *data, bool all)
6299 {
6300         struct perf_event *event;
6301
6302         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6303                 if (!all) {
6304                         if (event->state < PERF_EVENT_STATE_INACTIVE)
6305                                 continue;
6306                         if (!event_filter_match(event))
6307                                 continue;
6308                 }
6309
6310                 output(event, data);
6311         }
6312 }
6313
6314 static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
6315 {
6316         struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
6317         struct perf_event *event;
6318
6319         list_for_each_entry_rcu(event, &pel->list, sb_list) {
6320                 /*
6321                  * Skip events that are not fully formed yet; ensure that
6322                  * if we observe event->ctx, both event and ctx will be
6323                  * complete enough. See perf_install_in_context().
6324                  */
6325                 if (!smp_load_acquire(&event->ctx))
6326                         continue;
6327
6328                 if (event->state < PERF_EVENT_STATE_INACTIVE)
6329                         continue;
6330                 if (!event_filter_match(event))
6331                         continue;
6332                 output(event, data);
6333         }
6334 }
6335
6336 /*
6337  * Iterate all events that need to receive side-band events.
6338  *
6339  * For new callers: ensure that account_pmu_sb_event() includes
6340  * your event, otherwise it might not get delivered.
6341  */
6342 static void
6343 perf_iterate_sb(perf_iterate_f output, void *data,
6344                struct perf_event_context *task_ctx)
6345 {
6346         struct perf_event_context *ctx;
6347         int ctxn;
6348
6349         rcu_read_lock();
6350         preempt_disable();
6351
6352         /*
6353          * If task_ctx != NULL, we only notify that task context itself.
6354          * The task_ctx is set only for EXIT events, just before the task
6355          * context is released.
6356          */
6357         if (task_ctx) {
6358                 perf_iterate_ctx(task_ctx, output, data, false);
6359                 goto done;
6360         }
6361
6362         perf_iterate_sb_cpu(output, data);
6363
6364         for_each_task_context_nr(ctxn) {
6365                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
6366                 if (ctx)
6367                         perf_iterate_ctx(ctx, output, data, false);
6368         }
6369 done:
6370         preempt_enable();
6371         rcu_read_unlock();
6372 }
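
/*
 * Typical usage (an illustrative sketch; my_record and my_output are
 * hypothetical, the real callers below such as perf_event_comm_event()
 * follow the same shape):
 *
 *	struct my_record rec = { .header.size = sizeof(rec) };
 *
 *	perf_iterate_sb(my_output, &rec, NULL);
 *
 * where my_output() filters on the relevant attr bit and emits the record
 * via perf_output_begin()/perf_output_put()/perf_output_end().
 */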
6373
6374 /*
6375  * Clear all file-based filters at exec, they'll have to be
6376  * re-instated when/if these objects are mmapped again.
6377  */
6378 static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
6379 {
6380         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
6381         struct perf_addr_filter *filter;
6382         unsigned int restart = 0, count = 0;
6383         unsigned long flags;
6384
6385         if (!has_addr_filter(event))
6386                 return;
6387
6388         raw_spin_lock_irqsave(&ifh->lock, flags);
6389         list_for_each_entry(filter, &ifh->list, entry) {
6390                 if (filter->inode) {
6391                         event->addr_filters_offs[count] = 0;
6392                         restart++;
6393                 }
6394
6395                 count++;
6396         }
6397
6398         if (restart)
6399                 event->addr_filters_gen++;
6400         raw_spin_unlock_irqrestore(&ifh->lock, flags);
6401
6402         if (restart)
6403                 perf_event_stop(event, 1);
6404 }
6405
6406 void perf_event_exec(void)
6407 {
6408         struct perf_event_context *ctx;
6409         int ctxn;
6410
6411         rcu_read_lock();
6412         for_each_task_context_nr(ctxn) {
6413                 ctx = current->perf_event_ctxp[ctxn];
6414                 if (!ctx)
6415                         continue;
6416
6417                 perf_event_enable_on_exec(ctxn);
6418
6419                 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
6420                                    true);
6421         }
6422         rcu_read_unlock();
6423 }
6424
6425 struct remote_output {
6426         struct ring_buffer      *rb;
6427         int                     err;
6428 };
6429
6430 static void __perf_event_output_stop(struct perf_event *event, void *data)
6431 {
6432         struct perf_event *parent = event->parent;
6433         struct remote_output *ro = data;
6434         struct ring_buffer *rb = ro->rb;
6435         struct stop_event_data sd = {
6436                 .event  = event,
6437         };
6438
6439         if (!has_aux(event))
6440                 return;
6441
6442         if (!parent)
6443                 parent = event;
6444
6445         /*
6446          * In case of inheritance, it will be the parent that links to the
6447          * ring-buffer, but it will be the child that's actually using it.
6448          *
6449          * We are using event::rb to determine if the event should be stopped;
6450          * however, this may race with ring_buffer_attach() (through set_output),
6451          * which will make us skip the event that actually needs to be stopped.
6452          * So ring_buffer_attach() has to stop an aux event before re-assigning
6453          * its rb pointer.
6454          */
6455         if (rcu_dereference(parent->rb) == rb)
6456                 ro->err = __perf_event_stop(&sd);
6457 }
6458
6459 static int __perf_pmu_output_stop(void *info)
6460 {
6461         struct perf_event *event = info;
6462         struct pmu *pmu = event->pmu;
6463         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
6464         struct remote_output ro = {
6465                 .rb     = event->rb,
6466         };
6467
6468         rcu_read_lock();
6469         perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
6470         if (cpuctx->task_ctx)
6471                 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
6472                                    &ro, false);
6473         rcu_read_unlock();
6474
6475         return ro.err;
6476 }
6477
6478 static void perf_pmu_output_stop(struct perf_event *event)
6479 {
6480         struct perf_event *iter;
6481         int err, cpu;
6482
6483 restart:
6484         rcu_read_lock();
6485         list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
6486                 /*
6487                  * For per-CPU events, we need to make sure that neither they
6488                  * nor their children are running; for cpu==-1 events it's
6489                  * sufficient to stop the event itself if it's active, since
6490                  * it can't have children.
6491                  */
6492                 cpu = iter->cpu;
6493                 if (cpu == -1)
6494                         cpu = READ_ONCE(iter->oncpu);
6495
6496                 if (cpu == -1)
6497                         continue;
6498
6499                 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
6500                 if (err == -EAGAIN) {
6501                         rcu_read_unlock();
6502                         goto restart;
6503                 }
6504         }
6505         rcu_read_unlock();
6506 }
6507
6508 /*
6509  * task tracking -- fork/exit
6510  *
6511  * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
6512  */
6513
6514 struct perf_task_event {
6515         struct task_struct              *task;
6516         struct perf_event_context       *task_ctx;
6517
6518         struct {
6519                 struct perf_event_header        header;
6520
6521                 u32                             pid;
6522                 u32                             ppid;
6523                 u32                             tid;
6524                 u32                             ptid;
6525                 u64                             time;
6526         } event_id;
6527 };
6528
6529 static int perf_event_task_match(struct perf_event *event)
6530 {
6531         return event->attr.comm  || event->attr.mmap ||
6532                event->attr.mmap2 || event->attr.mmap_data ||
6533                event->attr.task;
6534 }
6535
6536 static void perf_event_task_output(struct perf_event *event,
6537                                    void *data)
6538 {
6539         struct perf_task_event *task_event = data;
6540         struct perf_output_handle handle;
6541         struct perf_sample_data sample;
6542         struct task_struct *task = task_event->task;
6543         int ret, size = task_event->event_id.header.size;
6544
6545         if (!perf_event_task_match(event))
6546                 return;
6547
6548         perf_event_header__init_id(&task_event->event_id.header, &sample, event);
6549
6550         ret = perf_output_begin(&handle, event,
6551                                 task_event->event_id.header.size);
6552         if (ret)
6553                 goto out;
6554
6555         task_event->event_id.pid = perf_event_pid(event, task);
6556         task_event->event_id.ppid = perf_event_pid(event, current);
6557
6558         task_event->event_id.tid = perf_event_tid(event, task);
6559         task_event->event_id.ptid = perf_event_tid(event, current);
6560
6561         task_event->event_id.time = perf_event_clock(event);
6562
6563         perf_output_put(&handle, task_event->event_id);
6564
6565         perf_event__output_id_sample(event, &handle, &sample);
6566
6567         perf_output_end(&handle);
6568 out:
6569         task_event->event_id.header.size = size;
6570 }
6571
6572 static void perf_event_task(struct task_struct *task,
6573                               struct perf_event_context *task_ctx,
6574                               int new)
6575 {
6576         struct perf_task_event task_event;
6577
6578         if (!atomic_read(&nr_comm_events) &&
6579             !atomic_read(&nr_mmap_events) &&
6580             !atomic_read(&nr_task_events))
6581                 return;
6582
6583         task_event = (struct perf_task_event){
6584                 .task     = task,
6585                 .task_ctx = task_ctx,
6586                 .event_id    = {
6587                         .header = {
6588                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
6589                                 .misc = 0,
6590                                 .size = sizeof(task_event.event_id),
6591                         },
6592                         /* .pid  */
6593                         /* .ppid */
6594                         /* .tid  */
6595                         /* .ptid */
6596                         /* .time */
6597                 },
6598         };
6599
6600         perf_iterate_sb(perf_event_task_output,
6601                        &task_event,
6602                        task_ctx);
6603 }
6604
6605 void perf_event_fork(struct task_struct *task)
6606 {
6607         perf_event_task(task, NULL, 1);
6608         perf_event_namespaces(task);
6609 }
6610
6611 /*
6612  * comm tracking
6613  */
6614
6615 struct perf_comm_event {
6616         struct task_struct      *task;
6617         char                    *comm;
6618         int                     comm_size;
6619
6620         struct {
6621                 struct perf_event_header        header;
6622
6623                 u32                             pid;
6624                 u32                             tid;
6625         } event_id;
6626 };
6627
6628 static int perf_event_comm_match(struct perf_event *event)
6629 {
6630         return event->attr.comm;
6631 }
6632
6633 static void perf_event_comm_output(struct perf_event *event,
6634                                    void *data)
6635 {
6636         struct perf_comm_event *comm_event = data;
6637         struct perf_output_handle handle;
6638         struct perf_sample_data sample;
6639         int size = comm_event->event_id.header.size;
6640         int ret;
6641
6642         if (!perf_event_comm_match(event))
6643                 return;
6644
6645         perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
6646         ret = perf_output_begin(&handle, event,
6647                                 comm_event->event_id.header.size);
6648
6649         if (ret)
6650                 goto out;
6651
6652         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
6653         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
6654
6655         perf_output_put(&handle, comm_event->event_id);
6656         __output_copy(&handle, comm_event->comm,
6657                                    comm_event->comm_size);
6658
6659         perf_event__output_id_sample(event, &handle, &sample);
6660
6661         perf_output_end(&handle);
6662 out:
6663         comm_event->event_id.header.size = size;
6664 }
6665
6666 static void perf_event_comm_event(struct perf_comm_event *comm_event)
6667 {
6668         char comm[TASK_COMM_LEN];
6669         unsigned int size;
6670
6671         memset(comm, 0, sizeof(comm));
6672         strlcpy(comm, comm_event->task->comm, sizeof(comm));
6673         size = ALIGN(strlen(comm)+1, sizeof(u64));
6674
6675         comm_event->comm = comm;
6676         comm_event->comm_size = size;
6677
6678         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
6679
6680         perf_iterate_sb(perf_event_comm_output,
6681                        comm_event,
6682                        NULL);
6683 }
6684
6685 void perf_event_comm(struct task_struct *task, bool exec)
6686 {
6687         struct perf_comm_event comm_event;
6688
6689         if (!atomic_read(&nr_comm_events))
6690                 return;
6691
6692         comm_event = (struct perf_comm_event){
6693                 .task   = task,
6694                 /* .comm      */
6695                 /* .comm_size */
6696                 .event_id  = {
6697                         .header = {
6698                                 .type = PERF_RECORD_COMM,
6699                                 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
6700                                 /* .size */
6701                         },
6702                         /* .pid */
6703                         /* .tid */
6704                 },
6705         };
6706
6707         perf_event_comm_event(&comm_event);
6708 }
6709
6710 /*
6711  * namespaces tracking
6712  */
6713
6714 struct perf_namespaces_event {
6715         struct task_struct              *task;
6716
6717         struct {
6718                 struct perf_event_header        header;
6719
6720                 u32                             pid;
6721                 u32                             tid;
6722                 u64                             nr_namespaces;
6723                 struct perf_ns_link_info        link_info[NR_NAMESPACES];
6724         } event_id;
6725 };
6726
6727 static int perf_event_namespaces_match(struct perf_event *event)
6728 {
6729         return event->attr.namespaces;
6730 }
6731
6732 static void perf_event_namespaces_output(struct perf_event *event,
6733                                          void *data)
6734 {
6735         struct perf_namespaces_event *namespaces_event = data;
6736         struct perf_output_handle handle;
6737         struct perf_sample_data sample;
6738         u16 header_size = namespaces_event->event_id.header.size;
6739         int ret;
6740
6741         if (!perf_event_namespaces_match(event))
6742                 return;
6743
6744         perf_event_header__init_id(&namespaces_event->event_id.header,
6745                                    &sample, event);
6746         ret = perf_output_begin(&handle, event,
6747                                 namespaces_event->event_id.header.size);
6748         if (ret)
6749                 goto out;
6750
6751         namespaces_event->event_id.pid = perf_event_pid(event,
6752                                                         namespaces_event->task);
6753         namespaces_event->event_id.tid = perf_event_tid(event,
6754                                                         namespaces_event->task);
6755
6756         perf_output_put(&handle, namespaces_event->event_id);
6757
6758         perf_event__output_id_sample(event, &handle, &sample);
6759
6760         perf_output_end(&handle);
6761 out:
6762         namespaces_event->event_id.header.size = header_size;
6763 }
6764
6765 static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
6766                                    struct task_struct *task,
6767                                    const struct proc_ns_operations *ns_ops)
6768 {
6769         struct path ns_path;
6770         struct inode *ns_inode;
6771         void *error;
6772
6773         error = ns_get_path(&ns_path, task, ns_ops);
6774         if (!error) {
6775                 ns_inode = ns_path.dentry->d_inode;
6776                 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
6777                 ns_link_info->ino = ns_inode->i_ino;
6778                 path_put(&ns_path);
6779         }
6780 }
6781
6782 void perf_event_namespaces(struct task_struct *task)
6783 {
6784         struct perf_namespaces_event namespaces_event;
6785         struct perf_ns_link_info *ns_link_info;
6786
6787         if (!atomic_read(&nr_namespaces_events))
6788                 return;
6789
6790         namespaces_event = (struct perf_namespaces_event){
6791                 .task   = task,
6792                 .event_id  = {
6793                         .header = {
6794                                 .type = PERF_RECORD_NAMESPACES,
6795                                 .misc = 0,
6796                                 .size = sizeof(namespaces_event.event_id),
6797                         },
6798                         /* .pid */
6799                         /* .tid */
6800                         .nr_namespaces = NR_NAMESPACES,
6801                         /* .link_info[NR_NAMESPACES] */
6802                 },
6803         };
6804
6805         ns_link_info = namespaces_event.event_id.link_info;
6806
6807         perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
6808                                task, &mntns_operations);
6809
6810 #ifdef CONFIG_USER_NS
6811         perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
6812                                task, &userns_operations);
6813 #endif
6814 #ifdef CONFIG_NET_NS
6815         perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
6816                                task, &netns_operations);
6817 #endif
6818 #ifdef CONFIG_UTS_NS
6819         perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
6820                                task, &utsns_operations);
6821 #endif
6822 #ifdef CONFIG_IPC_NS
6823         perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
6824                                task, &ipcns_operations);
6825 #endif
6826 #ifdef CONFIG_PID_NS
6827         perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
6828                                task, &pidns_operations);
6829 #endif
6830 #ifdef CONFIG_CGROUPS
6831         perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
6832                                task, &cgroupns_operations);
6833 #endif
6834
6835         perf_iterate_sb(perf_event_namespaces_output,
6836                         &namespaces_event,
6837                         NULL);
6838 }
6839
6840 /*
6841  * mmap tracking
6842  */
6843
6844 struct perf_mmap_event {
6845         struct vm_area_struct   *vma;
6846
6847         const char              *file_name;
6848         int                     file_size;
6849         int                     maj, min;
6850         u64                     ino;
6851         u64                     ino_generation;
6852         u32                     prot, flags;
6853
6854         struct {
6855                 struct perf_event_header        header;
6856
6857                 u32                             pid;
6858                 u32                             tid;
6859                 u64                             start;
6860                 u64                             len;
6861                 u64                             pgoff;
6862         } event_id;
6863 };
6864
6865 static int perf_event_mmap_match(struct perf_event *event,
6866                                  void *data)
6867 {
6868         struct perf_mmap_event *mmap_event = data;
6869         struct vm_area_struct *vma = mmap_event->vma;
6870         int executable = vma->vm_flags & VM_EXEC;
6871
6872         return (!executable && event->attr.mmap_data) ||
6873                (executable && (event->attr.mmap || event->attr.mmap2));
6874 }
6875
6876 static void perf_event_mmap_output(struct perf_event *event,
6877                                    void *data)
6878 {
6879         struct perf_mmap_event *mmap_event = data;
6880         struct perf_output_handle handle;
6881         struct perf_sample_data sample;
6882         int size = mmap_event->event_id.header.size;
6883         int ret;
6884
6885         if (!perf_event_mmap_match(event, data))
6886                 return;
6887
6888         if (event->attr.mmap2) {
6889                 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
6890                 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
6891                 mmap_event->event_id.header.size += sizeof(mmap_event->min);
6892                 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
6893                 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
6894                 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
6895                 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
6896         }
6897
6898         perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
6899         ret = perf_output_begin(&handle, event,
6900                                 mmap_event->event_id.header.size);
6901         if (ret)
6902                 goto out;
6903
6904         mmap_event->event_id.pid = perf_event_pid(event, current);
6905         mmap_event->event_id.tid = perf_event_tid(event, current);
6906
6907         perf_output_put(&handle, mmap_event->event_id);
6908
6909         if (event->attr.mmap2) {
6910                 perf_output_put(&handle, mmap_event->maj);
6911                 perf_output_put(&handle, mmap_event->min);
6912                 perf_output_put(&handle, mmap_event->ino);
6913                 perf_output_put(&handle, mmap_event->ino_generation);
6914                 perf_output_put(&handle, mmap_event->prot);
6915                 perf_output_put(&handle, mmap_event->flags);
6916         }
6917
6918         __output_copy(&handle, mmap_event->file_name,
6919                                    mmap_event->file_size);
6920
6921         perf_event__output_id_sample(event, &handle, &sample);
6922
6923         perf_output_end(&handle);
6924 out:
6925         mmap_event->event_id.header.size = size;
6926 }
6927
6928 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
6929 {
6930         struct vm_area_struct *vma = mmap_event->vma;
6931         struct file *file = vma->vm_file;
6932         int maj = 0, min = 0;
6933         u64 ino = 0, gen = 0;
6934         u32 prot = 0, flags = 0;
6935         unsigned int size;
6936         char tmp[16];
6937         char *buf = NULL;
6938         char *name;
6939
6940         if (vma->vm_flags & VM_READ)
6941                 prot |= PROT_READ;
6942         if (vma->vm_flags & VM_WRITE)
6943                 prot |= PROT_WRITE;
6944         if (vma->vm_flags & VM_EXEC)
6945                 prot |= PROT_EXEC;
6946
6947         if (vma->vm_flags & VM_MAYSHARE)
6948                 flags = MAP_SHARED;
6949         else
6950                 flags = MAP_PRIVATE;
6951
6952         if (vma->vm_flags & VM_DENYWRITE)
6953                 flags |= MAP_DENYWRITE;
6954         if (vma->vm_flags & VM_MAYEXEC)
6955                 flags |= MAP_EXECUTABLE;
6956         if (vma->vm_flags & VM_LOCKED)
6957                 flags |= MAP_LOCKED;
6958         if (vma->vm_flags & VM_HUGETLB)
6959                 flags |= MAP_HUGETLB;
6960
6961         if (file) {
6962                 struct inode *inode;
6963                 dev_t dev;
6964
6965                 buf = kmalloc(PATH_MAX, GFP_KERNEL);
6966                 if (!buf) {
6967                         name = "//enomem";
6968                         goto cpy_name;
6969                 }
6970                 /*
6971                  * d_path() fills the buffer from the end backwards, so we
6972                  * need to leave enough zero bytes after the string to handle
6973                  * the 64-bit alignment we do later.
6974                  */
6975                 name = file_path(file, buf, PATH_MAX - sizeof(u64));
6976                 if (IS_ERR(name)) {
6977                         name = "//toolong";
6978                         goto cpy_name;
6979                 }
6980                 inode = file_inode(vma->vm_file);
6981                 dev = inode->i_sb->s_dev;
6982                 ino = inode->i_ino;
6983                 gen = inode->i_generation;
6984                 maj = MAJOR(dev);
6985                 min = MINOR(dev);
6986
6987                 goto got_name;
6988         } else {
6989                 if (vma->vm_ops && vma->vm_ops->name) {
6990                         name = (char *) vma->vm_ops->name(vma);
6991                         if (name)
6992                                 goto cpy_name;
6993                 }
6994
6995                 name = (char *)arch_vma_name(vma);
6996                 if (name)
6997                         goto cpy_name;
6998
6999                 if (vma->vm_start <= vma->vm_mm->start_brk &&
7000                                 vma->vm_end >= vma->vm_mm->brk) {
7001                         name = "[heap]";
7002                         goto cpy_name;
7003                 }
7004                 if (vma->vm_start <= vma->vm_mm->start_stack &&
7005                                 vma->vm_end >= vma->vm_mm->start_stack) {
7006                         name = "[stack]";
7007                         goto cpy_name;
7008                 }
7009
7010                 name = "//anon";
7011                 goto cpy_name;
7012         }
7013
7014 cpy_name:
7015         strlcpy(tmp, name, sizeof(tmp));
7016         name = tmp;
7017 got_name:
7018         /*
7019          * Since our buffer works in 8 byte units we need to align our string
7020          * size to a multiple of 8. However, we must guarantee the tail end is
7021          * zeroed out to avoid leaking random bits to userspace.
7022          */
7023         size = strlen(name)+1;
7024         while (!IS_ALIGNED(size, sizeof(u64)))
7025                 name[size++] = '\0';
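        /* example: "ld.so" gives strlen()+1 == 6, zero-padded here to size == 8 */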
7026
7027         mmap_event->file_name = name;
7028         mmap_event->file_size = size;
7029         mmap_event->maj = maj;
7030         mmap_event->min = min;
7031         mmap_event->ino = ino;
7032         mmap_event->ino_generation = gen;
7033         mmap_event->prot = prot;
7034         mmap_event->flags = flags;
7035
7036         if (!(vma->vm_flags & VM_EXEC))
7037                 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
7038
7039         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
7040
7041         perf_iterate_sb(perf_event_mmap_output,
7042                        mmap_event,
7043                        NULL);
7044
7045         kfree(buf);
7046 }
7047
7048 /*
7049  * Check whether inode and address range match filter criteria.
7050  */
7051 static bool perf_addr_filter_match(struct perf_addr_filter *filter,
7052                                      struct file *file, unsigned long offset,
7053                                      unsigned long size)
7054 {
7055         if (filter->inode != file_inode(file))
7056                 return false;
7057
7058         if (filter->offset > offset + size)
7059                 return false;
7060
7061         if (filter->offset + filter->size < offset)
7062                 return false;
7063
7064         return true;
7065 }
7066
7067 static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
7068 {
7069         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
7070         struct vm_area_struct *vma = data;
7071         unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
7072         struct file *file = vma->vm_file;
7073         struct perf_addr_filter *filter;
7074         unsigned int restart = 0, count = 0;
7075
7076         if (!has_addr_filter(event))
7077                 return;
7078
7079         if (!file)
7080                 return;
7081
7082         raw_spin_lock_irqsave(&ifh->lock, flags);
7083         list_for_each_entry(filter, &ifh->list, entry) {
7084                 if (perf_addr_filter_match(filter, file, off,
7085                                              vma->vm_end - vma->vm_start)) {
7086                         event->addr_filters_offs[count] = vma->vm_start;
7087                         restart++;
7088                 }
7089
7090                 count++;
7091         }
7092
7093         if (restart)
7094                 event->addr_filters_gen++;
7095         raw_spin_unlock_irqrestore(&ifh->lock, flags);
7096
7097         if (restart)
7098                 perf_event_stop(event, 1);
7099 }
7100
7101 /*
7102  * Adjust all of the current task's events' address filters to the new vma.
7103  */
7104 static void perf_addr_filters_adjust(struct vm_area_struct *vma)
7105 {
7106         struct perf_event_context *ctx;
7107         int ctxn;
7108
7109         /*
7110          * Data tracing isn't supported yet, so there is no need to keep
7111          * track of anything that isn't related to executable code.
7112          */
7113         if (!(vma->vm_flags & VM_EXEC))
7114                 return;
7115
7116         rcu_read_lock();
7117         for_each_task_context_nr(ctxn) {
7118                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7119                 if (!ctx)
7120                         continue;
7121
7122                 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
7123         }
7124         rcu_read_unlock();
7125 }
7126
7127 void perf_event_mmap(struct vm_area_struct *vma)
7128 {
7129         struct perf_mmap_event mmap_event;
7130
7131         if (!atomic_read(&nr_mmap_events))
7132                 return;
7133
7134         mmap_event = (struct perf_mmap_event){
7135                 .vma    = vma,
7136                 /* .file_name */
7137                 /* .file_size */
7138                 .event_id  = {
7139                         .header = {
7140                                 .type = PERF_RECORD_MMAP,
7141                                 .misc = PERF_RECORD_MISC_USER,
7142                                 /* .size */
7143                         },
7144                         /* .pid */
7145                         /* .tid */
7146                         .start  = vma->vm_start,
7147                         .len    = vma->vm_end - vma->vm_start,
7148                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
7149                 },
7150                 /* .maj (attr_mmap2 only) */
7151                 /* .min (attr_mmap2 only) */
7152                 /* .ino (attr_mmap2 only) */
7153                 /* .ino_generation (attr_mmap2 only) */
7154                 /* .prot (attr_mmap2 only) */
7155                 /* .flags (attr_mmap2 only) */
7156         };
7157
7158         perf_addr_filters_adjust(vma);
7159         perf_event_mmap_event(&mmap_event);
7160 }
7161
7162 void perf_event_aux_event(struct perf_event *event, unsigned long head,
7163                           unsigned long size, u64 flags)
7164 {
7165         struct perf_output_handle handle;
7166         struct perf_sample_data sample;
7167         struct perf_aux_event {
7168                 struct perf_event_header        header;
7169                 u64                             offset;
7170                 u64                             size;
7171                 u64                             flags;
7172         } rec = {
7173                 .header = {
7174                         .type = PERF_RECORD_AUX,
7175                         .misc = 0,
7176                         .size = sizeof(rec),
7177                 },
7178                 .offset         = head,
7179                 .size           = size,
7180                 .flags          = flags,
7181         };
7182         int ret;
7183
7184         perf_event_header__init_id(&rec.header, &sample, event);
7185         ret = perf_output_begin(&handle, event, rec.header.size);
7186
7187         if (ret)
7188                 return;
7189
7190         perf_output_put(&handle, rec);
7191         perf_event__output_id_sample(event, &handle, &sample);
7192
7193         perf_output_end(&handle);
7194 }
7195
7196 /*
7197  * Lost/dropped samples logging
7198  */
7199 void perf_log_lost_samples(struct perf_event *event, u64 lost)
7200 {
7201         struct perf_output_handle handle;
7202         struct perf_sample_data sample;
7203         int ret;
7204
7205         struct {
7206                 struct perf_event_header        header;
7207                 u64                             lost;
7208         } lost_samples_event = {
7209                 .header = {
7210                         .type = PERF_RECORD_LOST_SAMPLES,
7211                         .misc = 0,
7212                         .size = sizeof(lost_samples_event),
7213                 },
7214                 .lost           = lost,
7215         };
7216
7217         perf_event_header__init_id(&lost_samples_event.header, &sample, event);
7218
7219         ret = perf_output_begin(&handle, event,
7220                                 lost_samples_event.header.size);
7221         if (ret)
7222                 return;
7223
7224         perf_output_put(&handle, lost_samples_event);
7225         perf_event__output_id_sample(event, &handle, &sample);
7226         perf_output_end(&handle);
7227 }
7228
7229 /*
7230  * context_switch tracking
7231  */
7232
7233 struct perf_switch_event {
7234         struct task_struct      *task;
7235         struct task_struct      *next_prev;
7236
7237         struct {
7238                 struct perf_event_header        header;
7239                 u32                             next_prev_pid;
7240                 u32                             next_prev_tid;
7241         } event_id;
7242 };
7243
7244 static int perf_event_switch_match(struct perf_event *event)
7245 {
7246         return event->attr.context_switch;
7247 }
7248
7249 static void perf_event_switch_output(struct perf_event *event, void *data)
7250 {
7251         struct perf_switch_event *se = data;
7252         struct perf_output_handle handle;
7253         struct perf_sample_data sample;
7254         int ret;
7255
7256         if (!perf_event_switch_match(event))
7257                 return;
7258
7259         /* Only CPU-wide events are allowed to see next/prev pid/tid */
7260         if (event->ctx->task) {
7261                 se->event_id.header.type = PERF_RECORD_SWITCH;
7262                 se->event_id.header.size = sizeof(se->event_id.header);
7263         } else {
7264                 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
7265                 se->event_id.header.size = sizeof(se->event_id);
7266                 se->event_id.next_prev_pid =
7267                                         perf_event_pid(event, se->next_prev);
7268                 se->event_id.next_prev_tid =
7269                                         perf_event_tid(event, se->next_prev);
7270         }
7271
7272         perf_event_header__init_id(&se->event_id.header, &sample, event);
7273
7274         ret = perf_output_begin(&handle, event, se->event_id.header.size);
7275         if (ret)
7276                 return;
7277
7278         if (event->ctx->task)
7279                 perf_output_put(&handle, se->event_id.header);
7280         else
7281                 perf_output_put(&handle, se->event_id);
7282
7283         perf_event__output_id_sample(event, &handle, &sample);
7284
7285         perf_output_end(&handle);
7286 }
7287
7288 static void perf_event_switch(struct task_struct *task,
7289                               struct task_struct *next_prev, bool sched_in)
7290 {
7291         struct perf_switch_event switch_event;
7292
7293         /* N.B. caller checks nr_switch_events != 0 */
7294
7295         switch_event = (struct perf_switch_event){
7296                 .task           = task,
7297                 .next_prev      = next_prev,
7298                 .event_id       = {
7299                         .header = {
7300                                 /* .type */
7301                                 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
7302                                 /* .size */
7303                         },
7304                         /* .next_prev_pid */
7305                         /* .next_prev_tid */
7306                 },
7307         };
7308
7309         perf_iterate_sb(perf_event_switch_output,
7310                        &switch_event,
7311                        NULL);
7312 }
7313
7314 /*
7315  * IRQ throttle logging
7316  */
7317
7318 static void perf_log_throttle(struct perf_event *event, int enable)
7319 {
7320         struct perf_output_handle handle;
7321         struct perf_sample_data sample;
7322         int ret;
7323
7324         struct {
7325                 struct perf_event_header        header;
7326                 u64                             time;
7327                 u64                             id;
7328                 u64                             stream_id;
7329         } throttle_event = {
7330                 .header = {
7331                         .type = PERF_RECORD_THROTTLE,
7332                         .misc = 0,
7333                         .size = sizeof(throttle_event),
7334                 },
7335                 .time           = perf_event_clock(event),
7336                 .id             = primary_event_id(event),
7337                 .stream_id      = event->id,
7338         };
7339
7340         if (enable)
7341                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
7342
7343         perf_event_header__init_id(&throttle_event.header, &sample, event);
7344
7345         ret = perf_output_begin(&handle, event,
7346                                 throttle_event.header.size);
7347         if (ret)
7348                 return;
7349
7350         perf_output_put(&handle, throttle_event);
7351         perf_event__output_id_sample(event, &handle, &sample);
7352         perf_output_end(&handle);
7353 }
7354
7355 void perf_event_itrace_started(struct perf_event *event)
7356 {
7357         event->attach_state |= PERF_ATTACH_ITRACE;
7358 }
7359
7360 static void perf_log_itrace_start(struct perf_event *event)
7361 {
7362         struct perf_output_handle handle;
7363         struct perf_sample_data sample;
7364         struct perf_aux_event {
7365                 struct perf_event_header        header;
7366                 u32                             pid;
7367                 u32                             tid;
7368         } rec;
7369         int ret;
7370
7371         if (event->parent)
7372                 event = event->parent;
7373
7374         if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
7375             event->attach_state & PERF_ATTACH_ITRACE)
7376                 return;
7377
7378         rec.header.type = PERF_RECORD_ITRACE_START;
7379         rec.header.misc = 0;
7380         rec.header.size = sizeof(rec);
7381         rec.pid = perf_event_pid(event, current);
7382         rec.tid = perf_event_tid(event, current);
7383
7384         perf_event_header__init_id(&rec.header, &sample, event);
7385         ret = perf_output_begin(&handle, event, rec.header.size);
7386
7387         if (ret)
7388                 return;
7389
7390         perf_output_put(&handle, rec);
7391         perf_event__output_id_sample(event, &handle, &sample);
7392
7393         perf_output_end(&handle);
7394 }
7395
7396 static int
7397 __perf_event_account_interrupt(struct perf_event *event, int throttle)
7398 {
7399         struct hw_perf_event *hwc = &event->hw;
7400         int ret = 0;
7401         u64 seq;
7402
7403         seq = __this_cpu_read(perf_throttled_seq);
7404         if (seq != hwc->interrupts_seq) {
7405                 hwc->interrupts_seq = seq;
7406                 hwc->interrupts = 1;
7407         } else {
7408                 hwc->interrupts++;
7409                 if (unlikely(throttle
7410                              && hwc->interrupts >= max_samples_per_tick)) {
7411                         __this_cpu_inc(perf_throttled_count);
7412                         tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
7413                         hwc->interrupts = MAX_INTERRUPTS;
7414                         perf_log_throttle(event, 0);
7415                         ret = 1;
7416                 }
7417         }
7418
7419         if (event->attr.freq) {
7420                 u64 now = perf_clock();
7421                 s64 delta = now - hwc->freq_time_stamp;
7422
7423                 hwc->freq_time_stamp = now;
7424
7425                 if (delta > 0 && delta < 2*TICK_NSEC)
7426                         perf_adjust_period(event, delta, hwc->last_period, true);
7427         }
7428
7429         return ret;
7430 }
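
/*
 * Note (added for clarity): once hwc->interrupts reaches MAX_INTERRUPTS the
 * event stays throttled until the timer tick path (perf_event_task_tick())
 * unthrottles it and logs the matching PERF_RECORD_UNTHROTTLE via
 * perf_log_throttle(event, 1).
 */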
7431
7432 int perf_event_account_interrupt(struct perf_event *event)
7433 {
7434         return __perf_event_account_interrupt(event, 1);
7435 }
7436
7437 /*
7438  * Generic event overflow handling, sampling.
7439  */
7440
7441 static int __perf_event_overflow(struct perf_event *event,
7442                                    int throttle, struct perf_sample_data *data,
7443                                    struct pt_regs *regs)
7444 {
7445         int events = atomic_read(&event->event_limit);
7446         int ret = 0;
7447
7448         /*
7449          * Non-sampling counters might still use the PMI to fold short
7450          * hardware counters; ignore those.
7451          */
7452         if (unlikely(!is_sampling_event(event)))
7453                 return 0;
7454
7455         ret = __perf_event_account_interrupt(event, throttle);
7456
7457         /*
7458          * XXX event_limit might not quite work as expected on inherited
7459          * events
7460          */
7461
7462         event->pending_kill = POLL_IN;
7463         if (events && atomic_dec_and_test(&event->event_limit)) {
7464                 ret = 1;
7465                 event->pending_kill = POLL_HUP;
7466
7467                 perf_event_disable_inatomic(event);
7468         }
7469
7470         READ_ONCE(event->overflow_handler)(event, data, regs);
7471
7472         if (*perf_event_fasync(event) && event->pending_kill) {
7473                 event->pending_wakeup = 1;
7474                 irq_work_queue(&event->pending);
7475         }
7476
7477         return ret;
7478 }
7479
7480 int perf_event_overflow(struct perf_event *event,
7481                           struct perf_sample_data *data,
7482                           struct pt_regs *regs)
7483 {
7484         return __perf_event_overflow(event, 1, data, regs);
7485 }
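
/*
 * Minimal sketch (added for illustration, not part of this file) of how a
 * PMU interrupt handler typically hands an overflow to the generic layer;
 * the driver-side stop helper is hypothetical:
 *
 *	struct perf_sample_data data;
 *
 *	perf_sample_data_init(&data, 0, event->hw.last_period);
 *	if (perf_event_overflow(event, &data, regs))
 *		my_pmu_stop(event, 0);	// event_limit hit, stop the event
 */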
7486
7487 /*
7488  * Generic software event infrastructure
7489  */
7490
7491 struct swevent_htable {
7492         struct swevent_hlist            *swevent_hlist;
7493         struct mutex                    hlist_mutex;
7494         int                             hlist_refcount;
7495
7496         /* Recursion avoidance in each contexts */
7497         int                             recursion[PERF_NR_CONTEXTS];
7498 };
7499
7500 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
7501
7502 /*
7503  * We directly increment event->count and keep a second value in
7504  * event->hw.period_left to count intervals. This period counter
7505  * is kept in the range [-sample_period, 0] so that we can use its
7506  * sign as the overflow trigger.
7507  */
7508
7509 u64 perf_swevent_set_period(struct perf_event *event)
7510 {
7511         struct hw_perf_event *hwc = &event->hw;
7512         u64 period = hwc->last_period;
7513         u64 nr, offset;
7514         s64 old, val;
7515
7516         hwc->last_period = hwc->sample_period;
7517
7518 again:
7519         old = val = local64_read(&hwc->period_left);
7520         if (val < 0)
7521                 return 0;
7522
7523         nr = div64_u64(period + val, period);
7524         offset = nr * period;
7525         val -= offset;
7526         if (local64_cmpxchg(&hwc->period_left, old, val) != old)
7527                 goto again;
7528
7529         return nr;
7530 }
7531
7532 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
7533                                     struct perf_sample_data *data,
7534                                     struct pt_regs *regs)
7535 {
7536         struct hw_perf_event *hwc = &event->hw;
7537         int throttle = 0;
7538
7539         if (!overflow)
7540                 overflow = perf_swevent_set_period(event);
7541
7542         if (hwc->interrupts == MAX_INTERRUPTS)
7543                 return;
7544
7545         for (; overflow; overflow--) {
7546                 if (__perf_event_overflow(event, throttle,
7547                                             data, regs)) {
7548                         /*
7549                          * We inhibit the overflow from happening when
7550                          * hwc->interrupts == MAX_INTERRUPTS.
7551                          */
7552                         break;
7553                 }
7554                 throttle = 1;
7555         }
7556 }
7557
7558 static void perf_swevent_event(struct perf_event *event, u64 nr,
7559                                struct perf_sample_data *data,
7560                                struct pt_regs *regs)
7561 {
7562         struct hw_perf_event *hwc = &event->hw;
7563
7564         local64_add(nr, &event->count);
7565
7566         if (!regs)
7567                 return;
7568
7569         if (!is_sampling_event(event))
7570                 return;
7571
7572         if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
7573                 data->period = nr;
7574                 return perf_swevent_overflow(event, 1, data, regs);
7575         } else
7576                 data->period = event->hw.last_period;
7577
7578         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
7579                 return perf_swevent_overflow(event, 1, data, regs);
7580
7581         if (local64_add_negative(nr, &hwc->period_left))
7582                 return;
7583
7584         perf_swevent_overflow(event, 0, data, regs);
7585 }
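
/*
 * Worked example of the period accounting above (added for illustration):
 * with sample_period == 100 and period_left == -30, an event batch of
 * nr == 50 makes local64_add_negative() see period_left == 20 >= 0, so
 * perf_swevent_overflow() runs; perf_swevent_set_period() then reports
 * (100 + 20) / 100 == 1 elapsed period and resets period_left to
 * 20 - 100 == -80.
 */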
7586
7587 static int perf_exclude_event(struct perf_event *event,
7588                               struct pt_regs *regs)
7589 {
7590         if (event->hw.state & PERF_HES_STOPPED)
7591                 return 1;
7592
7593         if (regs) {
7594                 if (event->attr.exclude_user && user_mode(regs))
7595                         return 1;
7596
7597                 if (event->attr.exclude_kernel && !user_mode(regs))
7598                         return 1;
7599         }
7600
7601         return 0;
7602 }
7603
7604 static int perf_swevent_match(struct perf_event *event,
7605                                 enum perf_type_id type,
7606                                 u32 event_id,
7607                                 struct perf_sample_data *data,
7608                                 struct pt_regs *regs)
7609 {
7610         if (event->attr.type != type)
7611                 return 0;
7612
7613         if (event->attr.config != event_id)
7614                 return 0;
7615
7616         if (perf_exclude_event(event, regs))
7617                 return 0;
7618
7619         return 1;
7620 }
7621
7622 static inline u64 swevent_hash(u64 type, u32 event_id)
7623 {
7624         u64 val = event_id | (type << 32);
7625
7626         return hash_64(val, SWEVENT_HLIST_BITS);
7627 }
7628
7629 static inline struct hlist_head *
7630 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
7631 {
7632         u64 hash = swevent_hash(type, event_id);
7633
7634         return &hlist->heads[hash];
7635 }
7636
7637 /* For the read side: look up the hlist head when events trigger */
7638 static inline struct hlist_head *
7639 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
7640 {
7641         struct swevent_hlist *hlist;
7642
7643         hlist = rcu_dereference(swhash->swevent_hlist);
7644         if (!hlist)
7645                 return NULL;
7646
7647         return __find_swevent_head(hlist, type, event_id);
7648 }
7649
7650 /* For the event head insertion and removal in the hlist */
7651 static inline struct hlist_head *
7652 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
7653 {
7654         struct swevent_hlist *hlist;
7655         u32 event_id = event->attr.config;
7656         u64 type = event->attr.type;
7657
7658         /*
7659          * Event scheduling is always serialized against hlist allocation
7660          * and release, which makes the protected version suitable here.
7661          * The context lock guarantees that.
7662          */
7663         hlist = rcu_dereference_protected(swhash->swevent_hlist,
7664                                           lockdep_is_held(&event->ctx->lock));
7665         if (!hlist)
7666                 return NULL;
7667
7668         return __find_swevent_head(hlist, type, event_id);
7669 }
7670
7671 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
7672                                     u64 nr,
7673                                     struct perf_sample_data *data,
7674                                     struct pt_regs *regs)
7675 {
7676         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7677         struct perf_event *event;
7678         struct hlist_head *head;
7679
7680         rcu_read_lock();
7681         head = find_swevent_head_rcu(swhash, type, event_id);
7682         if (!head)
7683                 goto end;
7684
7685         hlist_for_each_entry_rcu(event, head, hlist_entry) {
7686                 if (perf_swevent_match(event, type, event_id, data, regs))
7687                         perf_swevent_event(event, nr, data, regs);
7688         }
7689 end:
7690         rcu_read_unlock();
7691 }
7692
7693 DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
7694
7695 int perf_swevent_get_recursion_context(void)
7696 {
7697         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7698
7699         return get_recursion_context(swhash->recursion);
7700 }
7701 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
7702
7703 void perf_swevent_put_recursion_context(int rctx)
7704 {
7705         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7706
7707         put_recursion_context(swhash->recursion, rctx);
7708 }
7709
7710 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
7711 {
7712         struct perf_sample_data data;
7713
7714         if (WARN_ON_ONCE(!regs))
7715                 return;
7716
7717         perf_sample_data_init(&data, addr, 0);
7718         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
7719 }
7720
7721 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
7722 {
7723         int rctx;
7724
7725         preempt_disable_notrace();
7726         rctx = perf_swevent_get_recursion_context();
7727         if (unlikely(rctx < 0))
7728                 goto fail;
7729
7730         ___perf_sw_event(event_id, nr, regs, addr);
7731
7732         perf_swevent_put_recursion_context(rctx);
7733 fail:
7734         preempt_enable_notrace();
7735 }
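
/*
 * Note (added for clarity): kernel code normally reaches this through the
 * perf_sw_event() wrapper in <linux/perf_event.h>, which first checks the
 * perf_swevent_enabled[] static key so the call is patched out entirely
 * when no software event of that type exists.
 */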
7736
7737 static void perf_swevent_read(struct perf_event *event)
7738 {
7739 }
7740
7741 static int perf_swevent_add(struct perf_event *event, int flags)
7742 {
7743         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7744         struct hw_perf_event *hwc = &event->hw;
7745         struct hlist_head *head;
7746
7747         if (is_sampling_event(event)) {
7748                 hwc->last_period = hwc->sample_period;
7749                 perf_swevent_set_period(event);
7750         }
7751
7752         hwc->state = !(flags & PERF_EF_START);
7753
7754         head = find_swevent_head(swhash, event);
7755         if (WARN_ON_ONCE(!head))
7756                 return -EINVAL;
7757
7758         hlist_add_head_rcu(&event->hlist_entry, head);
7759         perf_event_update_userpage(event);
7760
7761         return 0;
7762 }
7763
7764 static void perf_swevent_del(struct perf_event *event, int flags)
7765 {
7766         hlist_del_rcu(&event->hlist_entry);
7767 }
7768
7769 static void perf_swevent_start(struct perf_event *event, int flags)
7770 {
7771         event->hw.state = 0;
7772 }
7773
7774 static void perf_swevent_stop(struct perf_event *event, int flags)
7775 {
7776         event->hw.state = PERF_HES_STOPPED;
7777 }
7778
7779 /* Deref the hlist from the update side */
7780 static inline struct swevent_hlist *
7781 swevent_hlist_deref(struct swevent_htable *swhash)
7782 {
7783         return rcu_dereference_protected(swhash->swevent_hlist,
7784                                          lockdep_is_held(&swhash->hlist_mutex));
7785 }
7786
7787 static void swevent_hlist_release(struct swevent_htable *swhash)
7788 {
7789         struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
7790
7791         if (!hlist)
7792                 return;
7793
7794         RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
7795         kfree_rcu(hlist, rcu_head);
7796 }
7797
7798 static void swevent_hlist_put_cpu(int cpu)
7799 {
7800         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7801
7802         mutex_lock(&swhash->hlist_mutex);
7803
7804         if (!--swhash->hlist_refcount)
7805                 swevent_hlist_release(swhash);
7806
7807         mutex_unlock(&swhash->hlist_mutex);
7808 }
7809
7810 static void swevent_hlist_put(void)
7811 {
7812         int cpu;
7813
7814         for_each_possible_cpu(cpu)
7815                 swevent_hlist_put_cpu(cpu);
7816 }
7817
7818 static int swevent_hlist_get_cpu(int cpu)
7819 {
7820         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7821         int err = 0;
7822
7823         mutex_lock(&swhash->hlist_mutex);
7824         if (!swevent_hlist_deref(swhash) &&
7825             cpumask_test_cpu(cpu, perf_online_mask)) {
7826                 struct swevent_hlist *hlist;
7827
7828                 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
7829                 if (!hlist) {
7830                         err = -ENOMEM;
7831                         goto exit;
7832                 }
7833                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
7834         }
7835         swhash->hlist_refcount++;
7836 exit:
7837         mutex_unlock(&swhash->hlist_mutex);
7838
7839         return err;
7840 }
7841
7842 static int swevent_hlist_get(void)
7843 {
7844         int err, cpu, failed_cpu;
7845
7846         mutex_lock(&pmus_lock);
7847         for_each_possible_cpu(cpu) {
7848                 err = swevent_hlist_get_cpu(cpu);
7849                 if (err) {
7850                         failed_cpu = cpu;
7851                         goto fail;
7852                 }
7853         }
7854         mutex_unlock(&pmus_lock);
7855         return 0;
7856 fail:
7857         for_each_possible_cpu(cpu) {
7858                 if (cpu == failed_cpu)
7859                         break;
7860                 swevent_hlist_put_cpu(cpu);
7861         }
7862         mutex_unlock(&pmus_lock);
7863         return err;
7864 }
7865
7866 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
7867
7868 static void sw_perf_event_destroy(struct perf_event *event)
7869 {
7870         u64 event_id = event->attr.config;
7871
7872         WARN_ON(event->parent);
7873
7874         static_key_slow_dec(&perf_swevent_enabled[event_id]);
7875         swevent_hlist_put();
7876 }
7877
7878 static int perf_swevent_init(struct perf_event *event)
7879 {
7880         u64 event_id = event->attr.config;
7881
7882         if (event->attr.type != PERF_TYPE_SOFTWARE)
7883                 return -ENOENT;
7884
7885         /*
7886          * no branch sampling for software events
7887          */
7888         if (has_branch_stack(event))
7889                 return -EOPNOTSUPP;
7890
7891         switch (event_id) {
7892         case PERF_COUNT_SW_CPU_CLOCK:
7893         case PERF_COUNT_SW_TASK_CLOCK:
7894                 return -ENOENT;
7895
7896         default:
7897                 break;
7898         }
7899
7900         if (event_id >= PERF_COUNT_SW_MAX)
7901                 return -ENOENT;
7902
7903         if (!event->parent) {
7904                 int err;
7905
7906                 err = swevent_hlist_get();
7907                 if (err)
7908                         return err;
7909
7910                 static_key_slow_inc(&perf_swevent_enabled[event_id]);
7911                 event->destroy = sw_perf_event_destroy;
7912         }
7913
7914         return 0;
7915 }
7916
7917 static struct pmu perf_swevent = {
7918         .task_ctx_nr    = perf_sw_context,
7919
7920         .capabilities   = PERF_PMU_CAP_NO_NMI,
7921
7922         .event_init     = perf_swevent_init,
7923         .add            = perf_swevent_add,
7924         .del            = perf_swevent_del,
7925         .start          = perf_swevent_start,
7926         .stop           = perf_swevent_stop,
7927         .read           = perf_swevent_read,
7928 };
7929
7930 #ifdef CONFIG_EVENT_TRACING
7931
7932 static int perf_tp_filter_match(struct perf_event *event,
7933                                 struct perf_sample_data *data)
7934 {
7935         void *record = data->raw->frag.data;
7936
7937         /* only top level events have filters set */
7938         if (event->parent)
7939                 event = event->parent;
7940
7941         if (likely(!event->filter) || filter_match_preds(event->filter, record))
7942                 return 1;
7943         return 0;
7944 }
7945
7946 static int perf_tp_event_match(struct perf_event *event,
7947                                 struct perf_sample_data *data,
7948                                 struct pt_regs *regs)
7949 {
7950         if (event->hw.state & PERF_HES_STOPPED)
7951                 return 0;
7952         /*
7953          * All tracepoints are from kernel-space.
7954          */
7955         if (event->attr.exclude_kernel)
7956                 return 0;
7957
7958         if (!perf_tp_filter_match(event, data))
7959                 return 0;
7960
7961         return 1;
7962 }
7963
7964 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
7965                                struct trace_event_call *call, u64 count,
7966                                struct pt_regs *regs, struct hlist_head *head,
7967                                struct task_struct *task)
7968 {
7969         struct bpf_prog *prog = call->prog;
7970
7971         if (prog) {
7972                 *(struct pt_regs **)raw_data = regs;
7973                 if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
7974                         perf_swevent_put_recursion_context(rctx);
7975                         return;
7976                 }
7977         }
7978         perf_tp_event(call->event.type, count, raw_data, size, regs, head,
7979                       rctx, task, NULL);
7980 }
7981 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
7982
7983 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
7984                    struct pt_regs *regs, struct hlist_head *head, int rctx,
7985                    struct task_struct *task, struct perf_event *event)
7986 {
7987         struct perf_sample_data data;
7988
7989         struct perf_raw_record raw = {
7990                 .frag = {
7991                         .size = entry_size,
7992                         .data = record,
7993                 },
7994         };
7995
7996         perf_sample_data_init(&data, 0, 0);
7997         data.raw = &raw;
7998
7999         perf_trace_buf_update(record, event_type);
8000
8001         /* Use the given event instead of the hlist */
8002         if (event) {
8003                 if (perf_tp_event_match(event, &data, regs))
8004                         perf_swevent_event(event, count, &data, regs);
8005         } else {
8006                 hlist_for_each_entry_rcu(event, head, hlist_entry) {
8007                         if (perf_tp_event_match(event, &data, regs))
8008                                 perf_swevent_event(event, count, &data, regs);
8009                 }
8010         }
8011
8012         /*
8013          * If we were given a target task, also iterate its context and
8014          * deliver this event there too.
8015          */
8016         if (task && task != current) {
8017                 struct perf_event_context *ctx;
8018                 struct trace_entry *entry = record;
8019
8020                 rcu_read_lock();
8021                 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
8022                 if (!ctx)
8023                         goto unlock;
8024
8025                 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
8026                         if (event->attr.type != PERF_TYPE_TRACEPOINT)
8027                                 continue;
8028                         if (event->attr.config != entry->type)
8029                                 continue;
8030                         if (perf_tp_event_match(event, &data, regs))
8031                                 perf_swevent_event(event, count, &data, regs);
8032                 }
8033 unlock:
8034                 rcu_read_unlock();
8035         }
8036
8037         perf_swevent_put_recursion_context(rctx);
8038 }
8039 EXPORT_SYMBOL_GPL(perf_tp_event);
8040
8041 static void tp_perf_event_destroy(struct perf_event *event)
8042 {
8043         perf_trace_destroy(event);
8044 }
8045
8046 static int perf_tp_event_init(struct perf_event *event)
8047 {
8048         int err;
8049
8050         if (event->attr.type != PERF_TYPE_TRACEPOINT)
8051                 return -ENOENT;
8052
8053         /*
8054          * no branch sampling for tracepoint events
8055          */
8056         if (has_branch_stack(event))
8057                 return -EOPNOTSUPP;
8058
8059         err = perf_trace_init(event);
8060         if (err)
8061                 return err;
8062
8063         event->destroy = tp_perf_event_destroy;
8064
8065         return 0;
8066 }
8067
8068 static struct pmu perf_tracepoint = {
8069         .task_ctx_nr    = perf_sw_context,
8070
8071         .event_init     = perf_tp_event_init,
8072         .add            = perf_trace_add,
8073         .del            = perf_trace_del,
8074         .start          = perf_swevent_start,
8075         .stop           = perf_swevent_stop,
8076         .read           = perf_swevent_read,
8077 };
8078
8079 static inline void perf_tp_register(void)
8080 {
8081         perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
8082 }
8083
8084 static void perf_event_free_filter(struct perf_event *event)
8085 {
8086         ftrace_profile_free_filter(event);
8087 }
8088
8089 #ifdef CONFIG_BPF_SYSCALL
8090 static void bpf_overflow_handler(struct perf_event *event,
8091                                  struct perf_sample_data *data,
8092                                  struct pt_regs *regs)
8093 {
8094         struct bpf_perf_event_data_kern ctx = {
8095                 .data = data,
8096                 .regs = regs,
8097         };
8098         int ret = 0;
8099
8100         preempt_disable();
8101         if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
8102                 goto out;
8103         rcu_read_lock();
8104         ret = BPF_PROG_RUN(event->prog, &ctx);
8105         rcu_read_unlock();
8106 out:
8107         __this_cpu_dec(bpf_prog_active);
8108         preempt_enable();
8109         if (!ret)
8110                 return;
8111
8112         event->orig_overflow_handler(event, data, regs);
8113 }
8114
8115 static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
8116 {
8117         struct bpf_prog *prog;
8118
8119         if (event->overflow_handler_context)
8120                 /* hw breakpoint or kernel counter */
8121                 return -EINVAL;
8122
8123         if (event->prog)
8124                 return -EEXIST;
8125
8126         prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
8127         if (IS_ERR(prog))
8128                 return PTR_ERR(prog);
8129
8130         event->prog = prog;
8131         event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
8132         WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
8133         return 0;
8134 }
8135
8136 static void perf_event_free_bpf_handler(struct perf_event *event)
8137 {
8138         struct bpf_prog *prog = event->prog;
8139
8140         if (!prog)
8141                 return;
8142
8143         WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
8144         event->prog = NULL;
8145         bpf_prog_put(prog);
8146 }
8147 #else
8148 static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
8149 {
8150         return -EOPNOTSUPP;
8151 }
8152 static void perf_event_free_bpf_handler(struct perf_event *event)
8153 {
8154 }
8155 #endif
8156
8157 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8158 {
8159         bool is_kprobe, is_tracepoint, is_syscall_tp;
8160         struct bpf_prog *prog;
8161
8162         if (event->attr.type != PERF_TYPE_TRACEPOINT)
8163                 return perf_event_set_bpf_handler(event, prog_fd);
8164
8165         if (event->tp_event->prog)
8166                 return -EEXIST;
8167
8168         is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
8169         is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
8170         is_syscall_tp = is_syscall_trace_event(event->tp_event);
8171         if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
8172                 /* bpf programs can only be attached to u/kprobe or tracepoint */
8173                 return -EINVAL;
8174
8175         prog = bpf_prog_get(prog_fd);
8176         if (IS_ERR(prog))
8177                 return PTR_ERR(prog);
8178
8179         if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
8180             (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
8181             (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
8182                 /* valid fd, but invalid bpf program type */
8183                 bpf_prog_put(prog);
8184                 return -EINVAL;
8185         }
8186
8187         if (is_tracepoint || is_syscall_tp) {
8188                 int off = trace_event_get_offsets(event->tp_event);
8189
8190                 if (prog->aux->max_ctx_offset > off) {
8191                         bpf_prog_put(prog);
8192                         return -EACCES;
8193                 }
8194         }
8195         event->tp_event->prog = prog;
8196         event->tp_event->bpf_prog_owner = event;
8197
8198         return 0;
8199 }
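/*
 * Illustrative example: userspace reaches perf_event_set_bpf_prog() through
 * the perf ioctl path, passing the fd of an already-loaded BPF program, e.g.
 *
 *	ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, bpf_prog_fd);
 */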
8200
8201 static void perf_event_free_bpf_prog(struct perf_event *event)
8202 {
8203         struct bpf_prog *prog;
8204
8205         perf_event_free_bpf_handler(event);
8206
8207         if (!event->tp_event)
8208                 return;
8209
8210         prog = event->tp_event->prog;
8211         if (prog && event->tp_event->bpf_prog_owner == event) {
8212                 event->tp_event->prog = NULL;
8213                 bpf_prog_put(prog);
8214         }
8215 }
8216
8217 #else
8218
8219 static inline void perf_tp_register(void)
8220 {
8221 }
8222
8223 static void perf_event_free_filter(struct perf_event *event)
8224 {
8225 }
8226
8227 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8228 {
8229         return -ENOENT;
8230 }
8231
8232 static void perf_event_free_bpf_prog(struct perf_event *event)
8233 {
8234 }
8235 #endif /* CONFIG_EVENT_TRACING */
8236
8237 #ifdef CONFIG_HAVE_HW_BREAKPOINT
8238 void perf_bp_event(struct perf_event *bp, void *data)
8239 {
8240         struct perf_sample_data sample;
8241         struct pt_regs *regs = data;
8242
8243         perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
8244
8245         if (!bp->hw.state && !perf_exclude_event(bp, regs))
8246                 perf_swevent_event(bp, 1, &sample, regs);
8247 }
8248 #endif
8249
8250 /*
8251  * Allocate a new address filter
8252  */
8253 static struct perf_addr_filter *
8254 perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
8255 {
8256         int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
8257         struct perf_addr_filter *filter;
8258
8259         filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
8260         if (!filter)
8261                 return NULL;
8262
8263         INIT_LIST_HEAD(&filter->entry);
8264         list_add_tail(&filter->entry, filters);
8265
8266         return filter;
8267 }
8268
8269 static void free_filters_list(struct list_head *filters)
8270 {
8271         struct perf_addr_filter *filter, *iter;
8272
8273         list_for_each_entry_safe(filter, iter, filters, entry) {
8274                 if (filter->inode)
8275                         iput(filter->inode);
8276                 list_del(&filter->entry);
8277                 kfree(filter);
8278         }
8279 }
8280
8281 /*
8282  * Free existing address filters and optionally install new ones
8283  */
8284 static void perf_addr_filters_splice(struct perf_event *event,
8285                                      struct list_head *head)
8286 {
8287         unsigned long flags;
8288         LIST_HEAD(list);
8289
8290         if (!has_addr_filter(event))
8291                 return;
8292
8293         /* don't bother with children, they don't have their own filters */
8294         if (event->parent)
8295                 return;
8296
8297         raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
8298
8299         list_splice_init(&event->addr_filters.list, &list);
8300         if (head)
8301                 list_splice(head, &event->addr_filters.list);
8302
8303         raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
8304
8305         free_filters_list(&list);
8306 }
8307
8308 /*
8309  * Scan through mm's vmas and see if one of them matches the
8310  * @filter; if so, adjust filter's address range.
8311  * Called with mm::mmap_sem down for reading.
8312  */
8313 static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
8314                                             struct mm_struct *mm)
8315 {
8316         struct vm_area_struct *vma;
8317
8318         for (vma = mm->mmap; vma; vma = vma->vm_next) {
8319                 struct file *file = vma->vm_file;
8320                 unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
8321                 unsigned long vma_size = vma->vm_end - vma->vm_start;
8322
8323                 if (!file)
8324                         continue;
8325
8326                 if (!perf_addr_filter_match(filter, file, off, vma_size))
8327                         continue;
8328
8329                 return vma->vm_start;
8330         }
8331
8332         return 0;
8333 }
8334
8335 /*
8336  * Update event's address range filters based on the
8337  * task's existing mappings, if any.
8338  */
8339 static void perf_event_addr_filters_apply(struct perf_event *event)
8340 {
8341         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8342         struct task_struct *task = READ_ONCE(event->ctx->task);
8343         struct perf_addr_filter *filter;
8344         struct mm_struct *mm = NULL;
8345         unsigned int count = 0;
8346         unsigned long flags;
8347
8348         /*
8349          * We may observe TASK_TOMBSTONE, which means that the event tear-down
8350          * will stop on the parent's child_mutex that our caller is also holding
8351          */
8352         if (task == TASK_TOMBSTONE)
8353                 return;
8354
8355         if (!ifh->nr_file_filters)
8356                 return;
8357
8358         mm = get_task_mm(event->ctx->task);
8359         if (!mm)
8360                 goto restart;
8361
8362         down_read(&mm->mmap_sem);
8363
8364         raw_spin_lock_irqsave(&ifh->lock, flags);
8365         list_for_each_entry(filter, &ifh->list, entry) {
8366                 event->addr_filters_offs[count] = 0;
8367
8368                 /*
8369                  * Adjust base offset if the filter is associated to a binary
8370                  * that needs to be mapped:
8371                  */
8372                 if (filter->inode)
8373                         event->addr_filters_offs[count] =
8374                                 perf_addr_filter_apply(filter, mm);
8375
8376                 count++;
8377         }
8378
8379         event->addr_filters_gen++;
8380         raw_spin_unlock_irqrestore(&ifh->lock, flags);
8381
8382         up_read(&mm->mmap_sem);
8383
8384         mmput(mm);
8385
8386 restart:
8387         perf_event_stop(event, 1);
8388 }
8389
8390 /*
8391  * Address range filtering: limiting the data to certain
8392  * instruction address ranges. Filters are ioctl()ed to us from
8393  * userspace as ASCII strings.
8394  *
8395  * Filter string format:
8396  *
8397  * ACTION RANGE_SPEC
8398  * where ACTION is one of:
8399  *  * "filter": limit the trace to this region
8400  *  * "start": start tracing from this address
8401  *  * "stop": stop tracing at this address/region;
8402  * RANGE_SPEC is
8403  *  * for kernel addresses: <start address>[/<size>]
8404  *  * for object files:     <start address>[/<size>]@</path/to/object/file>
8405  *
8406  * if <size> is not specified, the range is treated as a single address.
8407  */
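/*
 * Illustrative examples (the object path below is hypothetical):
 *
 *	filter 0x42000/0x1000@/usr/lib/libfoo.so
 *		- limit tracing to a 4KiB range of that object, for a
 *		  per-task event
 *	start 0xffffffff81000000
 *		- start tracing at this kernel address
 */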
8408 enum {
8409         IF_ACT_NONE = -1,
8410         IF_ACT_FILTER,
8411         IF_ACT_START,
8412         IF_ACT_STOP,
8413         IF_SRC_FILE,
8414         IF_SRC_KERNEL,
8415         IF_SRC_FILEADDR,
8416         IF_SRC_KERNELADDR,
8417 };
8418
8419 enum {
8420         IF_STATE_ACTION = 0,
8421         IF_STATE_SOURCE,
8422         IF_STATE_END,
8423 };
8424
8425 static const match_table_t if_tokens = {
8426         { IF_ACT_FILTER,        "filter" },
8427         { IF_ACT_START,         "start" },
8428         { IF_ACT_STOP,          "stop" },
8429         { IF_SRC_FILE,          "%u/%u@%s" },
8430         { IF_SRC_KERNEL,        "%u/%u" },
8431         { IF_SRC_FILEADDR,      "%u@%s" },
8432         { IF_SRC_KERNELADDR,    "%u" },
8433         { IF_ACT_NONE,          NULL },
8434 };
8435
8436 /*
8437  * Address filter string parser
8438  */
8439 static int
8440 perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
8441                              struct list_head *filters)
8442 {
8443         struct perf_addr_filter *filter = NULL;
8444         char *start, *orig, *filename = NULL;
8445         struct path path;
8446         substring_t args[MAX_OPT_ARGS];
8447         int state = IF_STATE_ACTION, token;
8448         unsigned int kernel = 0;
8449         int ret = -EINVAL;
8450
8451         orig = fstr = kstrdup(fstr, GFP_KERNEL);
8452         if (!fstr)
8453                 return -ENOMEM;
8454
8455         while ((start = strsep(&fstr, " ,\n")) != NULL) {
8456                 ret = -EINVAL;
8457
8458                 if (!*start)
8459                         continue;
8460
8461                 /* filter definition begins */
8462                 if (state == IF_STATE_ACTION) {
8463                         filter = perf_addr_filter_new(event, filters);
8464                         if (!filter)
8465                                 goto fail;
8466                 }
8467
8468                 token = match_token(start, if_tokens, args);
8469                 switch (token) {
8470                 case IF_ACT_FILTER:
8471                 case IF_ACT_START:
8472                         filter->filter = 1;
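                         /* fall through */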
8473
8474                 case IF_ACT_STOP:
8475                         if (state != IF_STATE_ACTION)
8476                                 goto fail;
8477
8478                         state = IF_STATE_SOURCE;
8479                         break;
8480
8481                 case IF_SRC_KERNELADDR:
8482                 case IF_SRC_KERNEL:
8483                         kernel = 1;
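                         /* fall through */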
8484
8485                 case IF_SRC_FILEADDR:
8486                 case IF_SRC_FILE:
8487                         if (state != IF_STATE_SOURCE)
8488                                 goto fail;
8489
8490                         if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
8491                                 filter->range = 1;
8492
8493                         *args[0].to = 0;
8494                         ret = kstrtoul(args[0].from, 0, &filter->offset);
8495                         if (ret)
8496                                 goto fail;
8497
8498                         if (filter->range) {
8499                                 *args[1].to = 0;
8500                                 ret = kstrtoul(args[1].from, 0, &filter->size);
8501                                 if (ret)
8502                                         goto fail;
8503                         }
8504
8505                         if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
8506                                 int fpos = filter->range ? 2 : 1;
8507
8508                                 filename = match_strdup(&args[fpos]);
8509                                 if (!filename) {
8510                                         ret = -ENOMEM;
8511                                         goto fail;
8512                                 }
8513                         }
8514
8515                         state = IF_STATE_END;
8516                         break;
8517
8518                 default:
8519                         goto fail;
8520                 }
8521
8522                 /*
8523                  * Filter definition is fully parsed, validate and install it.
8524                  * Make sure that it doesn't contradict itself or the event's
8525                  * attribute.
8526                  */
8527                 if (state == IF_STATE_END) {
8528                         ret = -EINVAL;
8529                         if (kernel && event->attr.exclude_kernel)
8530                                 goto fail;
8531
8532                         if (!kernel) {
8533                                 if (!filename)
8534                                         goto fail;
8535
8536                                 /*
8537                                  * For now, we only support file-based filters
8538                                  * in per-task events; doing so for CPU-wide
8539                                  * events requires additional context switching
8540                                  * trickery, since the same object code will be
8541                                  * mapped at different virtual addresses in
8542                                  * different processes.
8543                                  */
8544                                 ret = -EOPNOTSUPP;
8545                                 if (!event->ctx->task)
8546                                         goto fail_free_name;
8547
8548                                 /* look up the path and grab its inode */
8549                                 ret = kern_path(filename, LOOKUP_FOLLOW, &path);
8550                                 if (ret)
8551                                         goto fail_free_name;
8552
8553                                 filter->inode = igrab(d_inode(path.dentry));
8554                                 path_put(&path);
8555                                 kfree(filename);
8556                                 filename = NULL;
8557
8558                                 ret = -EINVAL;
8559                                 if (!filter->inode ||
8560                                     !S_ISREG(filter->inode->i_mode))
8561                                         /* free_filters_list() will iput() */
8562                                         goto fail;
8563
8564                                 event->addr_filters.nr_file_filters++;
8565                         }
8566
8567                         /* ready to consume more filters */
8568                         state = IF_STATE_ACTION;
8569                         filter = NULL;
8570                 }
8571         }
8572
8573         if (state != IF_STATE_ACTION)
8574                 goto fail;
8575
8576         kfree(orig);
8577
8578         return 0;
8579
8580 fail_free_name:
8581         kfree(filename);
8582 fail:
8583         free_filters_list(filters);
8584         kfree(orig);
8585
8586         return ret;
8587 }
8588
8589 static int
8590 perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
8591 {
8592         LIST_HEAD(filters);
8593         int ret;
8594
8595         /*
8596          * Since this is called in the perf_ioctl() path, we're already holding
8597          * ctx::mutex.
8598          */
8599         lockdep_assert_held(&event->ctx->mutex);
8600
8601         if (WARN_ON_ONCE(event->parent))
8602                 return -EINVAL;
8603
8604         ret = perf_event_parse_addr_filter(event, filter_str, &filters);
8605         if (ret)
8606                 goto fail_clear_files;
8607
8608         ret = event->pmu->addr_filters_validate(&filters);
8609         if (ret)
8610                 goto fail_free_filters;
8611
8612         /* remove existing filters, if any */
8613         perf_addr_filters_splice(event, &filters);
8614
8615         /* install new filters */
8616         perf_event_for_each_child(event, perf_event_addr_filters_apply);
8617
8618         return ret;
8619
8620 fail_free_filters:
8621         free_filters_list(&filters);
8622
8623 fail_clear_files:
8624         event->addr_filters.nr_file_filters = 0;
8625
8626         return ret;
8627 }
8628
8629 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
8630 {
8631         char *filter_str;
8632         int ret = -EINVAL;
8633
8634         if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
8635             !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
8636             !has_addr_filter(event))
8637                 return -EINVAL;
8638
8639         filter_str = strndup_user(arg, PAGE_SIZE);
8640         if (IS_ERR(filter_str))
8641                 return PTR_ERR(filter_str);
8642
8643         if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
8644             event->attr.type == PERF_TYPE_TRACEPOINT)
8645                 ret = ftrace_profile_set_filter(event, event->attr.config,
8646                                                 filter_str);
8647         else if (has_addr_filter(event))
8648                 ret = perf_event_set_addr_filter(event, filter_str);
8649
8650         kfree(filter_str);
8651         return ret;
8652 }
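/*
 * Illustrative example: userspace installs either kind of filter on an open
 * event fd via the PERF_EVENT_IOC_SET_FILTER ioctl; for an address filter on
 * a per-task event this could look like (hypothetical path):
 *
 *	ioctl(event_fd, PERF_EVENT_IOC_SET_FILTER,
 *	      "filter 0x42000/0x1000@/usr/lib/libfoo.so");
 */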
8653
8654 /*
8655  * hrtimer based swevent callback
8656  */
8657
8658 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
8659 {
8660         enum hrtimer_restart ret = HRTIMER_RESTART;
8661         struct perf_sample_data data;
8662         struct pt_regs *regs;
8663         struct perf_event *event;
8664         u64 period;
8665
8666         event = container_of(hrtimer, struct perf_event, hw.hrtimer);
8667
8668         if (event->state != PERF_EVENT_STATE_ACTIVE)
8669                 return HRTIMER_NORESTART;
8670
8671         event->pmu->read(event);
8672
8673         perf_sample_data_init(&data, 0, event->hw.last_period);
8674         regs = get_irq_regs();
8675
8676         if (regs && !perf_exclude_event(event, regs)) {
8677                 if (!(event->attr.exclude_idle && is_idle_task(current)))
8678                         if (__perf_event_overflow(event, 1, &data, regs))
8679                                 ret = HRTIMER_NORESTART;
8680         }
8681
8682         period = max_t(u64, 10000, event->hw.sample_period);
8683         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
8684
8685         return ret;
8686 }
8687
8688 static void perf_swevent_start_hrtimer(struct perf_event *event)
8689 {
8690         struct hw_perf_event *hwc = &event->hw;
8691         s64 period;
8692
8693         if (!is_sampling_event(event))
8694                 return;
8695
8696         period = local64_read(&hwc->period_left);
8697         if (period) {
8698                 if (period < 0)
8699                         period = 10000;
8700
8701                 local64_set(&hwc->period_left, 0);
8702         } else {
8703                 period = max_t(u64, 10000, hwc->sample_period);
8704         }
8705         hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
8706                       HRTIMER_MODE_REL_PINNED);
8707 }
8708
8709 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
8710 {
8711         struct hw_perf_event *hwc = &event->hw;
8712
8713         if (is_sampling_event(event)) {
8714                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
8715                 local64_set(&hwc->period_left, ktime_to_ns(remaining));
8716
8717                 hrtimer_cancel(&hwc->hrtimer);
8718         }
8719 }
8720
8721 static void perf_swevent_init_hrtimer(struct perf_event *event)
8722 {
8723         struct hw_perf_event *hwc = &event->hw;
8724
8725         if (!is_sampling_event(event))
8726                 return;
8727
8728         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
8729         hwc->hrtimer.function = perf_swevent_hrtimer;
8730
8731         /*
8732          * Since hrtimers have a fixed rate, we can do a static freq->period
8733          * mapping and avoid the whole period adjust feedback stuff.
8734          */
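        /*
         * For example: attr.sample_freq == 1000 (Hz) below becomes a fixed
         * sample_period of NSEC_PER_SEC / 1000 == 1,000,000 ns, i.e. one
         * hrtimer-driven sample every millisecond.
         */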
8735         if (event->attr.freq) {
8736                 long freq = event->attr.sample_freq;
8737
8738                 event->attr.sample_period = NSEC_PER_SEC / freq;
8739                 hwc->sample_period = event->attr.sample_period;
8740                 local64_set(&hwc->period_left, hwc->sample_period);
8741                 hwc->last_period = hwc->sample_period;
8742                 event->attr.freq = 0;
8743         }
8744 }
8745
8746 /*
8747  * Software event: cpu wall time clock
8748  */
8749
8750 static void cpu_clock_event_update(struct perf_event *event)
8751 {
8752         s64 prev;
8753         u64 now;
8754
8755         now = local_clock();
8756         prev = local64_xchg(&event->hw.prev_count, now);
8757         local64_add(now - prev, &event->count);
8758 }
8759
8760 static void cpu_clock_event_start(struct perf_event *event, int flags)
8761 {
8762         local64_set(&event->hw.prev_count, local_clock());
8763         perf_swevent_start_hrtimer(event);
8764 }
8765
8766 static void cpu_clock_event_stop(struct perf_event *event, int flags)
8767 {
8768         perf_swevent_cancel_hrtimer(event);
8769         cpu_clock_event_update(event);
8770 }
8771
8772 static int cpu_clock_event_add(struct perf_event *event, int flags)
8773 {
8774         if (flags & PERF_EF_START)
8775                 cpu_clock_event_start(event, flags);
8776         perf_event_update_userpage(event);
8777
8778         return 0;
8779 }
8780
8781 static void cpu_clock_event_del(struct perf_event *event, int flags)
8782 {
8783         cpu_clock_event_stop(event, flags);
8784 }
8785
8786 static void cpu_clock_event_read(struct perf_event *event)
8787 {
8788         cpu_clock_event_update(event);
8789 }
8790
8791 static int cpu_clock_event_init(struct perf_event *event)
8792 {
8793         if (event->attr.type != PERF_TYPE_SOFTWARE)
8794                 return -ENOENT;
8795
8796         if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
8797                 return -ENOENT;
8798
8799         /*
8800          * no branch sampling for software events
8801          */
8802         if (has_branch_stack(event))
8803                 return -EOPNOTSUPP;
8804
8805         perf_swevent_init_hrtimer(event);
8806
8807         return 0;
8808 }
8809
8810 static struct pmu perf_cpu_clock = {
8811         .task_ctx_nr    = perf_sw_context,
8812
8813         .capabilities   = PERF_PMU_CAP_NO_NMI,
8814
8815         .event_init     = cpu_clock_event_init,
8816         .add            = cpu_clock_event_add,
8817         .del            = cpu_clock_event_del,
8818         .start          = cpu_clock_event_start,
8819         .stop           = cpu_clock_event_stop,
8820         .read           = cpu_clock_event_read,
8821 };
8822
8823 /*
8824  * Software event: task time clock
8825  */
8826
8827 static void task_clock_event_update(struct perf_event *event, u64 now)
8828 {
8829         u64 prev;
8830         s64 delta;
8831
8832         prev = local64_xchg(&event->hw.prev_count, now);
8833         delta = now - prev;
8834         local64_add(delta, &event->count);
8835 }
8836
8837 static void task_clock_event_start(struct perf_event *event, int flags)
8838 {
8839         local64_set(&event->hw.prev_count, event->ctx->time);
8840         perf_swevent_start_hrtimer(event);
8841 }
8842
8843 static void task_clock_event_stop(struct perf_event *event, int flags)
8844 {
8845         perf_swevent_cancel_hrtimer(event);
8846         task_clock_event_update(event, event->ctx->time);
8847 }
8848
8849 static int task_clock_event_add(struct perf_event *event, int flags)
8850 {
8851         if (flags & PERF_EF_START)
8852                 task_clock_event_start(event, flags);
8853         perf_event_update_userpage(event);
8854
8855         return 0;
8856 }
8857
8858 static void task_clock_event_del(struct perf_event *event, int flags)
8859 {
8860         task_clock_event_stop(event, PERF_EF_UPDATE);
8861 }
8862
8863 static void task_clock_event_read(struct perf_event *event)
8864 {
8865         u64 now = perf_clock();
8866         u64 delta = now - event->ctx->timestamp;
8867         u64 time = event->ctx->time + delta;
8868
8869         task_clock_event_update(event, time);
8870 }
8871
8872 static int task_clock_event_init(struct perf_event *event)
8873 {
8874         if (event->attr.type != PERF_TYPE_SOFTWARE)
8875                 return -ENOENT;
8876
8877         if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
8878                 return -ENOENT;
8879
8880         /*
8881          * no branch sampling for software events
8882          */
8883         if (has_branch_stack(event))
8884                 return -EOPNOTSUPP;
8885
8886         perf_swevent_init_hrtimer(event);
8887
8888         return 0;
8889 }
8890
8891 static struct pmu perf_task_clock = {
8892         .task_ctx_nr    = perf_sw_context,
8893
8894         .capabilities   = PERF_PMU_CAP_NO_NMI,
8895
8896         .event_init     = task_clock_event_init,
8897         .add            = task_clock_event_add,
8898         .del            = task_clock_event_del,
8899         .start          = task_clock_event_start,
8900         .stop           = task_clock_event_stop,
8901         .read           = task_clock_event_read,
8902 };
8903
8904 static void perf_pmu_nop_void(struct pmu *pmu)
8905 {
8906 }
8907
8908 static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
8909 {
8910 }
8911
8912 static int perf_pmu_nop_int(struct pmu *pmu)
8913 {
8914         return 0;
8915 }
8916
8917 static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
8918
8919 static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
8920 {
8921         __this_cpu_write(nop_txn_flags, flags);
8922
8923         if (flags & ~PERF_PMU_TXN_ADD)
8924                 return;
8925
8926         perf_pmu_disable(pmu);
8927 }
8928
8929 static int perf_pmu_commit_txn(struct pmu *pmu)
8930 {
8931         unsigned int flags = __this_cpu_read(nop_txn_flags);
8932
8933         __this_cpu_write(nop_txn_flags, 0);
8934
8935         if (flags & ~PERF_PMU_TXN_ADD)
8936                 return 0;
8937
8938         perf_pmu_enable(pmu);
8939         return 0;
8940 }
8941
8942 static void perf_pmu_cancel_txn(struct pmu *pmu)
8943 {
8944         unsigned int flags =  __this_cpu_read(nop_txn_flags);
8945
8946         __this_cpu_write(nop_txn_flags, 0);
8947
8948         if (flags & ~PERF_PMU_TXN_ADD)
8949                 return;
8950
8951         perf_pmu_enable(pmu);
8952 }
8953
8954 static int perf_event_idx_default(struct perf_event *event)
8955 {
8956         return 0;
8957 }
8958
8959 /*
8960  * Ensures all contexts with the same task_ctx_nr have the same
8961  * pmu_cpu_context too.
8962  */
8963 static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
8964 {
8965         struct pmu *pmu;
8966
8967         if (ctxn < 0)
8968                 return NULL;
8969
8970         list_for_each_entry(pmu, &pmus, entry) {
8971                 if (pmu->task_ctx_nr == ctxn)
8972                         return pmu->pmu_cpu_context;
8973         }
8974
8975         return NULL;
8976 }
8977
8978 static void free_pmu_context(struct pmu *pmu)
8979 {
8980         /*
8981          * Static contexts such as perf_sw_context have a global lifetime
8982          * and may be shared between different PMUs. Avoid freeing them
8983          * when a single PMU is going away.
8984          */
8985         if (pmu->task_ctx_nr > perf_invalid_context)
8986                 return;
8987
8988         mutex_lock(&pmus_lock);
8989         free_percpu(pmu->pmu_cpu_context);
8990         mutex_unlock(&pmus_lock);
8991 }
8992
8993 /*
8994  * Let userspace know that this PMU supports address range filtering:
8995  */
8996 static ssize_t nr_addr_filters_show(struct device *dev,
8997                                     struct device_attribute *attr,
8998                                     char *page)
8999 {
9000         struct pmu *pmu = dev_get_drvdata(dev);
9001
9002         return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
9003 }
9004 DEVICE_ATTR_RO(nr_addr_filters);
9005
9006 static struct idr pmu_idr;
9007
9008 static ssize_t
9009 type_show(struct device *dev, struct device_attribute *attr, char *page)
9010 {
9011         struct pmu *pmu = dev_get_drvdata(dev);
9012
9013         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
9014 }
9015 static DEVICE_ATTR_RO(type);
9016
9017 static ssize_t
9018 perf_event_mux_interval_ms_show(struct device *dev,
9019                                 struct device_attribute *attr,
9020                                 char *page)
9021 {
9022         struct pmu *pmu = dev_get_drvdata(dev);
9023
9024         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
9025 }
9026
9027 static DEFINE_MUTEX(mux_interval_mutex);
9028
9029 static ssize_t
9030 perf_event_mux_interval_ms_store(struct device *dev,
9031                                  struct device_attribute *attr,
9032                                  const char *buf, size_t count)
9033 {
9034         struct pmu *pmu = dev_get_drvdata(dev);
9035         int timer, cpu, ret;
9036
9037         ret = kstrtoint(buf, 0, &timer);
9038         if (ret)
9039                 return ret;
9040
9041         if (timer < 1)
9042                 return -EINVAL;
9043
9044         /* same value, nothing to do */
9045         if (timer == pmu->hrtimer_interval_ms)
9046                 return count;
9047
9048         mutex_lock(&mux_interval_mutex);
9049         pmu->hrtimer_interval_ms = timer;
9050
9051         /* update all cpuctx for this PMU */
9052         cpus_read_lock();
9053         for_each_online_cpu(cpu) {
9054                 struct perf_cpu_context *cpuctx;
9055                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
9056                 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
9057
9058                 cpu_function_call(cpu,
9059                         (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
9060         }
9061         cpus_read_unlock();
9062         mutex_unlock(&mux_interval_mutex);
9063
9064         return count;
9065 }
9066 static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
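/*
 * Illustrative example: for a PMU registered on the "event_source" bus below,
 * the multiplexing interval can be inspected and tuned from userspace via
 * sysfs, e.g.
 *
 *	cat /sys/bus/event_source/devices/<pmu>/perf_event_mux_interval_ms
 *	echo 4 > /sys/bus/event_source/devices/<pmu>/perf_event_mux_interval_ms
 */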
9067
9068 static struct attribute *pmu_dev_attrs[] = {
9069         &dev_attr_type.attr,
9070         &dev_attr_perf_event_mux_interval_ms.attr,
9071         NULL,
9072 };
9073 ATTRIBUTE_GROUPS(pmu_dev);
9074
9075 static int pmu_bus_running;
9076 static struct bus_type pmu_bus = {
9077         .name           = "event_source",
9078         .dev_groups     = pmu_dev_groups,
9079 };
9080
9081 static void pmu_dev_release(struct device *dev)
9082 {
9083         kfree(dev);
9084 }
9085
9086 static int pmu_dev_alloc(struct pmu *pmu)
9087 {
9088         int ret = -ENOMEM;
9089
9090         pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
9091         if (!pmu->dev)
9092                 goto out;
9093
9094         pmu->dev->groups = pmu->attr_groups;
9095         device_initialize(pmu->dev);
9096         ret = dev_set_name(pmu->dev, "%s", pmu->name);
9097         if (ret)
9098                 goto free_dev;
9099
9100         dev_set_drvdata(pmu->dev, pmu);
9101         pmu->dev->bus = &pmu_bus;
9102         pmu->dev->release = pmu_dev_release;
9103         ret = device_add(pmu->dev);
9104         if (ret)
9105                 goto free_dev;
9106
9107         /* For PMUs with address filters, throw in an extra attribute: */
9108         if (pmu->nr_addr_filters)
9109                 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
9110
9111         if (ret)
9112                 goto del_dev;
9113
9114 out:
9115         return ret;
9116
9117 del_dev:
9118         device_del(pmu->dev);
9119
9120 free_dev:
9121         put_device(pmu->dev);
9122         goto out;
9123 }
9124
9125 static struct lock_class_key cpuctx_mutex;
9126 static struct lock_class_key cpuctx_lock;
9127
9128 int perf_pmu_register(struct pmu *pmu, const char *name, int type)
9129 {
9130         int cpu, ret;
9131
9132         mutex_lock(&pmus_lock);
9133         ret = -ENOMEM;
9134         pmu->pmu_disable_count = alloc_percpu(int);
9135         if (!pmu->pmu_disable_count)
9136                 goto unlock;
9137
9138         pmu->type = -1;
9139         if (!name)
9140                 goto skip_type;
9141         pmu->name = name;
9142
9143         if (type < 0) {
9144                 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
9145                 if (type < 0) {
9146                         ret = type;
9147                         goto free_pdc;
9148                 }
9149         }
9150         pmu->type = type;
9151
9152         if (pmu_bus_running) {
9153                 ret = pmu_dev_alloc(pmu);
9154                 if (ret)
9155                         goto free_idr;
9156         }
9157
9158 skip_type:
9159         if (pmu->task_ctx_nr == perf_hw_context) {
9160                 static int hw_context_taken = 0;
9161
9162                 /*
9163                  * Other than systems with heterogeneous CPUs, it never makes
9164                  * sense for two PMUs to share perf_hw_context. Uncore PMUs
9165                  * must use perf_invalid_context.
9166                  */
9167                 if (WARN_ON_ONCE(hw_context_taken &&
9168                     !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
9169                         pmu->task_ctx_nr = perf_invalid_context;
9170
9171                 hw_context_taken = 1;
9172         }
9173
9174         pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
9175         if (pmu->pmu_cpu_context)
9176                 goto got_cpu_context;
9177
9178         ret = -ENOMEM;
9179         pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
9180         if (!pmu->pmu_cpu_context)
9181                 goto free_dev;
9182
9183         for_each_possible_cpu(cpu) {
9184                 struct perf_cpu_context *cpuctx;
9185
9186                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
9187                 __perf_event_init_context(&cpuctx->ctx);
9188                 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
9189                 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
9190                 cpuctx->ctx.pmu = pmu;
9191                 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
9192
9193                 __perf_mux_hrtimer_init(cpuctx, cpu);
9194         }
9195
9196 got_cpu_context:
9197         if (!pmu->start_txn) {
9198                 if (pmu->pmu_enable) {
9199                         /*
9200                          * If we have pmu_enable/pmu_disable calls, install
9201                          * transaction stubs that use that to try and batch
9202                          * hardware accesses.
9203                          */
9204                         pmu->start_txn  = perf_pmu_start_txn;
9205                         pmu->commit_txn = perf_pmu_commit_txn;
9206                         pmu->cancel_txn = perf_pmu_cancel_txn;
9207                 } else {
9208                         pmu->start_txn  = perf_pmu_nop_txn;
9209                         pmu->commit_txn = perf_pmu_nop_int;
9210                         pmu->cancel_txn = perf_pmu_nop_void;
9211                 }
9212         }
9213
9214         if (!pmu->pmu_enable) {
9215                 pmu->pmu_enable  = perf_pmu_nop_void;
9216                 pmu->pmu_disable = perf_pmu_nop_void;
9217         }
9218
9219         if (!pmu->event_idx)
9220                 pmu->event_idx = perf_event_idx_default;
9221
9222         list_add_rcu(&pmu->entry, &pmus);
9223         atomic_set(&pmu->exclusive_cnt, 0);
9224         ret = 0;
9225 unlock:
9226         mutex_unlock(&pmus_lock);
9227
9228         return ret;
9229
9230 free_dev:
9231         device_del(pmu->dev);
9232         put_device(pmu->dev);
9233
9234 free_idr:
9235         if (pmu->type >= PERF_TYPE_MAX)
9236                 idr_remove(&pmu_idr, pmu->type);
9237
9238 free_pdc:
9239         free_percpu(pmu->pmu_disable_count);
9240         goto unlock;
9241 }
9242 EXPORT_SYMBOL_GPL(perf_pmu_register);
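/*
 * Illustrative sketch: a minimal, hypothetical driver-side PMU using the
 * registration API above. All "example_*" names are assumptions; the
 * callback shapes mirror the perf_swevent pmu earlier in this file, and
 * passing type == -1 requests a dynamically allocated type:
 *
 *	static int example_event_init(struct perf_event *event)
 *	{
 *		if (event->attr.type != event->pmu->type)
 *			return -ENOENT;
 *		return 0;
 *	}
 *
 *	static int example_add(struct perf_event *event, int flags)	{ return 0; }
 *	static void example_del(struct perf_event *event, int flags)	{ }
 *	static void example_start(struct perf_event *event, int flags)	{ }
 *	static void example_stop(struct perf_event *event, int flags)	{ }
 *	static void example_read(struct perf_event *event)		{ }
 *
 *	static struct pmu example_pmu = {
 *		.task_ctx_nr	= perf_sw_context,
 *		.capabilities	= PERF_PMU_CAP_NO_NMI,
 *		.event_init	= example_event_init,
 *		.add		= example_add,
 *		.del		= example_del,
 *		.start		= example_start,
 *		.stop		= example_stop,
 *		.read		= example_read,
 *	};
 *
 *	ret = perf_pmu_register(&example_pmu, "example", -1);
 */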
9243
9244 void perf_pmu_unregister(struct pmu *pmu)
9245 {
9246         int remove_device;
9247
9248         mutex_lock(&pmus_lock);
9249         remove_device = pmu_bus_running;
9250         list_del_rcu(&pmu->entry);
9251         mutex_unlock(&pmus_lock);
9252
9253         /*
9254          * We dereference the pmu list under both SRCU and regular RCU, so
9255          * synchronize against both of those.
9256          */
9257         synchronize_srcu(&pmus_srcu);
9258         synchronize_rcu();
9259
9260         free_percpu(pmu->pmu_disable_count);
9261         if (pmu->type >= PERF_TYPE_MAX)
9262                 idr_remove(&pmu_idr, pmu->type);
9263         if (remove_device) {
9264                 if (pmu->nr_addr_filters)
9265                         device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
9266                 device_del(pmu->dev);
9267                 put_device(pmu->dev);
9268         }
9269         free_pmu_context(pmu);
9270 }
9271 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
9272
9273 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
9274 {
9275         struct perf_event_context *ctx = NULL;
9276         int ret;
9277
9278         if (!try_module_get(pmu->module))
9279                 return -ENODEV;
9280
9281         if (event->group_leader != event) {
9282                 /*
9283                  * This ctx->mutex can nest when we're called through
9284                  * inheritance. See the perf_event_ctx_lock_nested() comment.
9285                  */
9286                 ctx = perf_event_ctx_lock_nested(event->group_leader,
9287                                                  SINGLE_DEPTH_NESTING);
9288                 BUG_ON(!ctx);
9289         }
9290
9291         event->pmu = pmu;
9292         ret = pmu->event_init(event);
9293
9294         if (ctx)
9295                 perf_event_ctx_unlock(event->group_leader, ctx);
9296
9297         if (ret)
9298                 module_put(pmu->module);
9299
9300         return ret;
9301 }
9302
9303 static struct pmu *perf_init_event(struct perf_event *event)
9304 {
9305         struct pmu *pmu;
9306         int idx;
9307         int ret;
9308
9309         idx = srcu_read_lock(&pmus_srcu);
9310
9311         /* Try parent's PMU first: */
9312         if (event->parent && event->parent->pmu) {
9313                 pmu = event->parent->pmu;
9314                 ret = perf_try_init_event(pmu, event);
9315                 if (!ret)
9316                         goto unlock;
9317         }
9318
9319         rcu_read_lock();
9320         pmu = idr_find(&pmu_idr, event->attr.type);
9321         rcu_read_unlock();
9322         if (pmu) {
9323                 ret = perf_try_init_event(pmu, event);
9324                 if (ret)
9325                         pmu = ERR_PTR(ret);
9326                 goto unlock;
9327         }
9328
9329         list_for_each_entry_rcu(pmu, &pmus, entry) {
9330                 ret = perf_try_init_event(pmu, event);
9331                 if (!ret)
9332                         goto unlock;
9333
9334                 if (ret != -ENOENT) {
9335                         pmu = ERR_PTR(ret);
9336                         goto unlock;
9337                 }
9338         }
9339         pmu = ERR_PTR(-ENOENT);
9340 unlock:
9341         srcu_read_unlock(&pmus_srcu, idx);
9342
9343         return pmu;
9344 }
9345
9346 static void attach_sb_event(struct perf_event *event)
9347 {
9348         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
9349
9350         raw_spin_lock(&pel->lock);
9351         list_add_rcu(&event->sb_list, &pel->list);
9352         raw_spin_unlock(&pel->lock);
9353 }
9354
9355 /*
9356  * We keep a list of all !task (and therefore per-cpu) events
9357  * that need to receive side-band records.
9358  *
9359  * This avoids having to scan all the various PMU per-cpu contexts
9360  * looking for them.
9361  */
9362 static void account_pmu_sb_event(struct perf_event *event)
9363 {
9364         if (is_sb_event(event))
9365                 attach_sb_event(event);
9366 }
9367
9368 static void account_event_cpu(struct perf_event *event, int cpu)
9369 {
9370         if (event->parent)
9371                 return;
9372
9373         if (is_cgroup_event(event))
9374                 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
9375 }
9376
9377 /* Freq events need the tick to stay alive (see perf_event_task_tick). */
9378 static void account_freq_event_nohz(void)
9379 {
9380 #ifdef CONFIG_NO_HZ_FULL
9381         /* Lock so we don't race with concurrent unaccount */
9382         spin_lock(&nr_freq_lock);
9383         if (atomic_inc_return(&nr_freq_events) == 1)
9384                 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
9385         spin_unlock(&nr_freq_lock);
9386 #endif
9387 }
9388
9389 static void account_freq_event(void)
9390 {
9391         if (tick_nohz_full_enabled())
9392                 account_freq_event_nohz();
9393         else
9394                 atomic_inc(&nr_freq_events);
9395 }
9396
9397
9398 static void account_event(struct perf_event *event)
9399 {
9400         bool inc = false;
9401
9402         if (event->parent)
9403                 return;
9404
9405         if (event->attach_state & PERF_ATTACH_TASK)
9406                 inc = true;
9407         if (event->attr.mmap || event->attr.mmap_data)
9408                 atomic_inc(&nr_mmap_events);
9409         if (event->attr.comm)
9410                 atomic_inc(&nr_comm_events);
9411         if (event->attr.namespaces)
9412                 atomic_inc(&nr_namespaces_events);
9413         if (event->attr.task)
9414                 atomic_inc(&nr_task_events);
9415         if (event->attr.freq)
9416                 account_freq_event();
9417         if (event->attr.context_switch) {
9418                 atomic_inc(&nr_switch_events);
9419                 inc = true;
9420         }
9421         if (has_branch_stack(event))
9422                 inc = true;
9423         if (is_cgroup_event(event))
9424                 inc = true;
9425
9426         if (inc) {
9427                 if (atomic_inc_not_zero(&perf_sched_count))
9428                         goto enabled;
9429
9430                 mutex_lock(&perf_sched_mutex);
9431                 if (!atomic_read(&perf_sched_count)) {
9432                         static_branch_enable(&perf_sched_events);
9433                         /*
9434                          * Guarantee that all CPUs observe the key change and
9435                          * call the perf scheduling hooks before proceeding to
9436                          * install events that need them.
9437                          */
9438                         synchronize_sched();
9439                 }
9440                 /*
9441                  * Now that we have waited for the sync_sched(), allow further
9442                  * increments to bypass the mutex.
9443                  */
9444                 atomic_inc(&perf_sched_count);
9445                 mutex_unlock(&perf_sched_mutex);
9446         }
9447 enabled:
9448
9449         account_event_cpu(event, event->cpu);
9450
9451         account_pmu_sb_event(event);
9452 }
9453
9454 /*
9455  * Allocate and initialize an event structure
9456  */
9457 static struct perf_event *
9458 perf_event_alloc(struct perf_event_attr *attr, int cpu,
9459                  struct task_struct *task,
9460                  struct perf_event *group_leader,
9461                  struct perf_event *parent_event,
9462                  perf_overflow_handler_t overflow_handler,
9463                  void *context, int cgroup_fd)
9464 {
9465         struct pmu *pmu;
9466         struct perf_event *event;
9467         struct hw_perf_event *hwc;
9468         long err = -EINVAL;
9469
9470         if ((unsigned)cpu >= nr_cpu_ids) {
9471                 if (!task || cpu != -1)
9472                         return ERR_PTR(-EINVAL);
9473         }
9474
9475         event = kzalloc(sizeof(*event), GFP_KERNEL);
9476         if (!event)
9477                 return ERR_PTR(-ENOMEM);
9478
9479         /*
9480          * Single events are their own group leaders, with an
9481          * empty sibling list:
9482          */
9483         if (!group_leader)
9484                 group_leader = event;
9485
9486         mutex_init(&event->child_mutex);
9487         INIT_LIST_HEAD(&event->child_list);
9488
9489         INIT_LIST_HEAD(&event->group_entry);
9490         INIT_LIST_HEAD(&event->event_entry);
9491         INIT_LIST_HEAD(&event->sibling_list);
9492         INIT_LIST_HEAD(&event->rb_entry);
9493         INIT_LIST_HEAD(&event->active_entry);
9494         INIT_LIST_HEAD(&event->addr_filters.list);
9495         INIT_HLIST_NODE(&event->hlist_entry);
9496
9497
9498         init_waitqueue_head(&event->waitq);
9499         init_irq_work(&event->pending, perf_pending_event);
9500
9501         mutex_init(&event->mmap_mutex);
9502         raw_spin_lock_init(&event->addr_filters.lock);
9503
9504         atomic_long_set(&event->refcount, 1);
9505         event->cpu              = cpu;
9506         event->attr             = *attr;
9507         event->group_leader     = group_leader;
9508         event->pmu              = NULL;
9509         event->oncpu            = -1;
9510
9511         event->parent           = parent_event;
9512
9513         event->ns               = get_pid_ns(task_active_pid_ns(current));
9514         event->id               = atomic64_inc_return(&perf_event_id);
9515
9516         event->state            = PERF_EVENT_STATE_INACTIVE;
9517
9518         if (task) {
9519                 event->attach_state = PERF_ATTACH_TASK;
9520                 /*
9521                  * XXX pmu::event_init needs to know what task to account to
9522                  * and we cannot use the ctx information because we need the
9523                  * pmu before we get a ctx.
9524                  */
9525                 get_task_struct(task);
9526                 event->hw.target = task;
9527         }
9528
9529         event->clock = &local_clock;
9530         if (parent_event)
9531                 event->clock = parent_event->clock;
9532
9533         if (!overflow_handler && parent_event) {
9534                 overflow_handler = parent_event->overflow_handler;
9535                 context = parent_event->overflow_handler_context;
9536 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
9537                 if (overflow_handler == bpf_overflow_handler) {
9538                         struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
9539
9540                         if (IS_ERR(prog)) {
9541                                 err = PTR_ERR(prog);
9542                                 goto err_ns;
9543                         }
9544                         event->prog = prog;
9545                         event->orig_overflow_handler =
9546                                 parent_event->orig_overflow_handler;
9547                 }
9548 #endif
9549         }
9550
9551         if (overflow_handler) {
9552                 event->overflow_handler = overflow_handler;
9553                 event->overflow_handler_context = context;
9554         } else if (is_write_backward(event)) {
9555                 event->overflow_handler = perf_event_output_backward;
9556                 event->overflow_handler_context = NULL;
9557         } else {
9558                 event->overflow_handler = perf_event_output_forward;
9559                 event->overflow_handler_context = NULL;
9560         }
9561
9562         perf_event__state_init(event);
9563
9564         pmu = NULL;
9565
9566         hwc = &event->hw;
9567         hwc->sample_period = attr->sample_period;
9568         if (attr->freq && attr->sample_freq)
9569                 hwc->sample_period = 1;
9570         hwc->last_period = hwc->sample_period;
9571
9572         local64_set(&hwc->period_left, hwc->sample_period);
9573
9574         /*
9575          * We currently do not support PERF_SAMPLE_READ on inherited events.
9576          * See perf_output_read().
9577          */
9578         if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
9579                 goto err_ns;
9580
9581         if (!has_branch_stack(event))
9582                 event->attr.branch_sample_type = 0;
9583
9584         if (cgroup_fd != -1) {
9585                 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
9586                 if (err)
9587                         goto err_ns;
9588         }
9589
9590         pmu = perf_init_event(event);
9591         if (IS_ERR(pmu)) {
9592                 err = PTR_ERR(pmu);
9593                 goto err_ns;
9594         }
9595
9596         err = exclusive_event_init(event);
9597         if (err)
9598                 goto err_pmu;
9599
9600         if (has_addr_filter(event)) {
9601                 event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
9602                                                    sizeof(unsigned long),
9603                                                    GFP_KERNEL);
9604                 if (!event->addr_filters_offs) {
9605                         err = -ENOMEM;
9606                         goto err_per_task;
9607                 }
9608
9609                 /* force hw sync on the address filters */
9610                 event->addr_filters_gen = 1;
9611         }
9612
9613         if (!event->parent) {
9614                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
9615                         err = get_callchain_buffers(attr->sample_max_stack);
9616                         if (err)
9617                                 goto err_addr_filters;
9618                 }
9619         }
9620
9621         /* symmetric to unaccount_event() in _free_event() */
9622         account_event(event);
9623
9624         return event;
9625
9626 err_addr_filters:
9627         kfree(event->addr_filters_offs);
9628
9629 err_per_task:
9630         exclusive_event_destroy(event);
9631
9632 err_pmu:
9633         if (event->destroy)
9634                 event->destroy(event);
9635         module_put(pmu->module);
9636 err_ns:
9637         if (is_cgroup_event(event))
9638                 perf_detach_cgroup(event);
9639         if (event->ns)
9640                 put_pid_ns(event->ns);
9641         if (event->hw.target)
9642                 put_task_struct(event->hw.target);
9643         kfree(event);
9644
9645         return ERR_PTR(err);
9646 }
9647
9648 static int perf_copy_attr(struct perf_event_attr __user *uattr,
9649                           struct perf_event_attr *attr)
9650 {
9651         u32 size;
9652         int ret;
9653
9654         if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
9655                 return -EFAULT;
9656
9657         /*
9658          * Zero the full structure, so that a short copy leaves the rest zeroed.
9659          */
9660         memset(attr, 0, sizeof(*attr));
9661
9662         ret = get_user(size, &uattr->size);
9663         if (ret)
9664                 return ret;
9665
9666         if (size > PAGE_SIZE)   /* silly large */
9667                 goto err_size;
9668
9669         if (!size)              /* abi compat */
9670                 size = PERF_ATTR_SIZE_VER0;
9671
9672         if (size < PERF_ATTR_SIZE_VER0)
9673                 goto err_size;
9674
9675         /*
9676          * If we're handed a bigger struct than we know of,
9677          * ensure all the unknown bits are 0 - i.e. new
9678          * user-space does not rely on any kernel feature
9679                  * extensions we don't know about yet.
9680          */
9681         if (size > sizeof(*attr)) {
9682                 unsigned char __user *addr;
9683                 unsigned char __user *end;
9684                 unsigned char val;
9685
9686                 addr = (void __user *)uattr + sizeof(*attr);
9687                 end  = (void __user *)uattr + size;
9688
9689                 for (; addr < end; addr++) {
9690                         ret = get_user(val, addr);
9691                         if (ret)
9692                                 return ret;
9693                         if (val)
9694                                 goto err_size;
9695                 }
9696                 size = sizeof(*attr);
9697         }
9698
9699         ret = copy_from_user(attr, uattr, size);
9700         if (ret)
9701                 return -EFAULT;
9702
9703         attr->size = size;
9704
9705         if (attr->__reserved_1)
9706                 return -EINVAL;
9707
9708         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
9709                 return -EINVAL;
9710
9711         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
9712                 return -EINVAL;
9713
9714         if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
9715                 u64 mask = attr->branch_sample_type;
9716
9717                 /* only using defined bits */
9718                 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
9719                         return -EINVAL;
9720
9721                 /* at least one branch bit must be set */
9722                 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
9723                         return -EINVAL;
9724
9725                 /* propagate priv level, when not set for branch */
9726                 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
9727
9728                         /* exclude_kernel checked on syscall entry */
9729                         if (!attr->exclude_kernel)
9730                                 mask |= PERF_SAMPLE_BRANCH_KERNEL;
9731
9732                         if (!attr->exclude_user)
9733                                 mask |= PERF_SAMPLE_BRANCH_USER;
9734
9735                         if (!attr->exclude_hv)
9736                                 mask |= PERF_SAMPLE_BRANCH_HV;
9737                         /*
9738                          * adjust user setting (for HW filter setup)
9739                          */
9740                         attr->branch_sample_type = mask;
9741                 }
9742                 /* privileged levels capture (kernel, hv): check permissions */
9743                 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
9744                     && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
9745                         return -EACCES;
9746         }
9747
9748         if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
9749                 ret = perf_reg_validate(attr->sample_regs_user);
9750                 if (ret)
9751                         return ret;
9752         }
9753
9754         if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
9755                 if (!arch_perf_have_user_stack_dump())
9756                         return -ENOSYS;
9757
9758                 /*
9759                  * We have __u32 type for the size, but so far
9760                  * we can only use __u16 as maximum due to the
9761                  * __u16 sample size limit.
9762                  */
9763                 if (attr->sample_stack_user >= USHRT_MAX)
9764                         return -EINVAL;
9765                 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
9766                         return -EINVAL;
9767         }
9768
9769         if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
9770                 ret = perf_reg_validate(attr->sample_regs_intr);
9771 out:
9772         return ret;
9773
9774 err_size:
9775         put_user(sizeof(*attr), &uattr->size);
9776         ret = -E2BIG;
9777         goto out;
9778 }
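/*
 * For reference, the size handling above implements the usual perf ABI
 * versioning contract. A minimal user-space sketch (field values are
 * illustrative only):
 *
 *	struct perf_event_attr attr;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.size   = sizeof(attr);
 *	attr.type   = PERF_TYPE_HARDWARE;
 *	attr.config = PERF_COUNT_HW_CPU_CYCLES;
 *
 * An old binary passing a smaller size gets the remaining fields zero-filled
 * by the memset() above; a new binary passing a larger size is accepted as
 * long as the bytes this kernel does not know about are zero, and otherwise
 * gets -E2BIG with uattr->size rewritten to the size this kernel supports.
 */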
9779
9780 static int
9781 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
9782 {
9783         struct ring_buffer *rb = NULL;
9784         int ret = -EINVAL;
9785
9786         if (!output_event)
9787                 goto set;
9788
9789         /* don't allow circular references */
9790         if (event == output_event)
9791                 goto out;
9792
9793         /*
9794          * Don't allow cross-cpu buffers
9795          */
9796         if (output_event->cpu != event->cpu)
9797                 goto out;
9798
9799         /*
9800          * If it's not a per-CPU rb, it must be the same task.
9801          */
9802         if (output_event->cpu == -1 && output_event->ctx != event->ctx)
9803                 goto out;
9804
9805         /*
9806          * Mixing clocks in the same buffer is trouble you don't need.
9807          */
9808         if (output_event->clock != event->clock)
9809                 goto out;
9810
9811         /*
9812          * Either writing ring buffer from beginning or from end.
9813          * Mixing is not allowed.
9814          */
9815         if (is_write_backward(output_event) != is_write_backward(event))
9816                 goto out;
9817
9818         /*
9819          * If both events generate aux data, they must be on the same PMU
9820          */
9821         if (has_aux(event) && has_aux(output_event) &&
9822             event->pmu != output_event->pmu)
9823                 goto out;
9824
9825 set:
9826         mutex_lock(&event->mmap_mutex);
9827         /* Can't redirect output if we've got an active mmap() */
9828         if (atomic_read(&event->mmap_count))
9829                 goto unlock;
9830
9831         if (output_event) {
9832                 /* get the rb we want to redirect to */
9833                 rb = ring_buffer_get(output_event);
9834                 if (!rb)
9835                         goto unlock;
9836         }
9837
9838         ring_buffer_attach(event, rb);
9839
9840         ret = 0;
9841 unlock:
9842         mutex_unlock(&event->mmap_mutex);
9843
9844 out:
9845         return ret;
9846 }
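/*
 * perf_event_set_output() is reached from user space via the
 * PERF_EVENT_IOC_SET_OUTPUT ioctl, or via PERF_FLAG_FD_OUTPUT at open time
 * (see sys_perf_event_open() below); e.g. ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT,
 * leader_fd) makes this event log into the leader's ring buffer, subject to
 * the same-cpu/same-task, same-clock and same-direction checks above.
 */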
9847
9848 static void mutex_lock_double(struct mutex *a, struct mutex *b)
9849 {
9850         if (b < a)
9851                 swap(a, b);
9852
9853         mutex_lock(a);
9854         mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
9855 }
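/*
 * mutex_lock_double() imposes a stable lock order by always taking the mutex
 * at the lower address first, so two contexts can be locked together from any
 * call site without risking an ABBA deadlock; the second lock is taken with
 * SINGLE_DEPTH_NESTING so lockdep accepts the nested acquisition.
 */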
9856
9857 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
9858 {
9859         bool nmi_safe = false;
9860
9861         switch (clk_id) {
9862         case CLOCK_MONOTONIC:
9863                 event->clock = &ktime_get_mono_fast_ns;
9864                 nmi_safe = true;
9865                 break;
9866
9867         case CLOCK_MONOTONIC_RAW:
9868                 event->clock = &ktime_get_raw_fast_ns;
9869                 nmi_safe = true;
9870                 break;
9871
9872         case CLOCK_REALTIME:
9873                 event->clock = &ktime_get_real_ns;
9874                 break;
9875
9876         case CLOCK_BOOTTIME:
9877                 event->clock = &ktime_get_boot_ns;
9878                 break;
9879
9880         case CLOCK_TAI:
9881                 event->clock = &ktime_get_tai_ns;
9882                 break;
9883
9884         default:
9885                 return -EINVAL;
9886         }
9887
9888         if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
9889                 return -EINVAL;
9890
9891         return 0;
9892 }
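/*
 * User space selects one of these clocks by setting attr.use_clockid = 1 and
 * attr.clockid to the desired clockid_t, so that sample timestamps can be
 * correlated with other timelines using the same clock. Unless the PMU
 * advertises PERF_PMU_CAP_NO_NMI, samples may be taken from NMI context, so
 * only the NMI-safe fast accessors are allowed in that case.
 */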
9893
9894 /*
9895  * Variation on perf_event_ctx_lock_nested(), except we take two context
9896  * mutexes.
9897  */
9898 static struct perf_event_context *
9899 __perf_event_ctx_lock_double(struct perf_event *group_leader,
9900                              struct perf_event_context *ctx)
9901 {
9902         struct perf_event_context *gctx;
9903
9904 again:
9905         rcu_read_lock();
9906         gctx = READ_ONCE(group_leader->ctx);
9907         if (!atomic_inc_not_zero(&gctx->refcount)) {
9908                 rcu_read_unlock();
9909                 goto again;
9910         }
9911         rcu_read_unlock();
9912
9913         mutex_lock_double(&gctx->mutex, &ctx->mutex);
9914
9915         if (group_leader->ctx != gctx) {
9916                 mutex_unlock(&ctx->mutex);
9917                 mutex_unlock(&gctx->mutex);
9918                 put_ctx(gctx);
9919                 goto again;
9920         }
9921
9922         return gctx;
9923 }
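/*
 * This is the usual lock-then-revalidate loop: group_leader->ctx can be
 * swizzled by a concurrent move_group, so after taking both mutexes we
 * re-check the leader's ctx pointer and retry if it changed while we slept.
 */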
9924
9925 /**
9926  * sys_perf_event_open - open a performance event, associate it to a task/cpu
9927  *
9928  * @attr_uptr:  event_id type attributes for monitoring/sampling
9929  * @pid:                target pid
9930  * @cpu:                target cpu
9931  * @group_fd:           group leader event fd
 * @flags:              PERF_FLAG_* flags (e.g. PERF_FLAG_FD_CLOEXEC)
9932  */
9933 SYSCALL_DEFINE5(perf_event_open,
9934                 struct perf_event_attr __user *, attr_uptr,
9935                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
9936 {
9937         struct perf_event *group_leader = NULL, *output_event = NULL;
9938         struct perf_event *event, *sibling;
9939         struct perf_event_attr attr;
9940         struct perf_event_context *ctx, *uninitialized_var(gctx);
9941         struct file *event_file = NULL;
9942         struct fd group = {NULL, 0};
9943         struct task_struct *task = NULL;
9944         struct pmu *pmu;
9945         int event_fd;
9946         int move_group = 0;
9947         int err;
9948         int f_flags = O_RDWR;
9949         int cgroup_fd = -1;
9950
9951         /* for future expandability... */
9952         if (flags & ~PERF_FLAG_ALL)
9953                 return -EINVAL;
9954
9955         err = perf_copy_attr(attr_uptr, &attr);
9956         if (err)
9957                 return err;
9958
9959         if (!attr.exclude_kernel) {
9960                 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
9961                         return -EACCES;
9962         }
9963
9964         if (attr.namespaces) {
9965                 if (!capable(CAP_SYS_ADMIN))
9966                         return -EACCES;
9967         }
9968
9969         if (attr.freq) {
9970                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
9971                         return -EINVAL;
9972         } else {
9973                 if (attr.sample_period & (1ULL << 63))
9974                         return -EINVAL;
9975         }
9976
9977         /* Only privileged users can get physical addresses */
9978         if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
9979             perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
9980                 return -EACCES;
9981
9982         if (!attr.sample_max_stack)
9983                 attr.sample_max_stack = sysctl_perf_event_max_stack;
9984
9985         /*
9986          * In cgroup mode, the pid argument is used to pass the fd
9987          * opened to the cgroup directory in cgroupfs. The cpu argument
9988          * designates the cpu on which to monitor threads from that
9989          * cgroup.
9990          */
9991         if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
9992                 return -EINVAL;
9993
9994         if (flags & PERF_FLAG_FD_CLOEXEC)
9995                 f_flags |= O_CLOEXEC;
9996
9997         event_fd = get_unused_fd_flags(f_flags);
9998         if (event_fd < 0)
9999                 return event_fd;
10000
10001         if (group_fd != -1) {
10002                 err = perf_fget_light(group_fd, &group);
10003                 if (err)
10004                         goto err_fd;
10005                 group_leader = group.file->private_data;
10006                 if (flags & PERF_FLAG_FD_OUTPUT)
10007                         output_event = group_leader;
10008                 if (flags & PERF_FLAG_FD_NO_GROUP)
10009                         group_leader = NULL;
10010         }
10011
10012         if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
10013                 task = find_lively_task_by_vpid(pid);
10014                 if (IS_ERR(task)) {
10015                         err = PTR_ERR(task);
10016                         goto err_group_fd;
10017                 }
10018         }
10019
10020         if (task && group_leader &&
10021             group_leader->attr.inherit != attr.inherit) {
10022                 err = -EINVAL;
10023                 goto err_task;
10024         }
10025
10026         if (task) {
10027                 err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
10028                 if (err)
10029                         goto err_task;
10030
10031                 /*
10032                  * Reuse ptrace permission checks for now.
10033                  *
10034                  * We must hold cred_guard_mutex across this and any potential
10035                  * perf_install_in_context() call for this new event to
10036                  * serialize against exec() altering our credentials (and the
10037                  * perf_event_exit_task() that could imply).
10038                  */
10039                 err = -EACCES;
10040                 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
10041                         goto err_cred;
10042         }
10043
10044         if (flags & PERF_FLAG_PID_CGROUP)
10045                 cgroup_fd = pid;
10046
10047         event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
10048                                  NULL, NULL, cgroup_fd);
10049         if (IS_ERR(event)) {
10050                 err = PTR_ERR(event);
10051                 goto err_cred;
10052         }
10053
10054         if (is_sampling_event(event)) {
10055                 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
10056                         err = -EOPNOTSUPP;
10057                         goto err_alloc;
10058                 }
10059         }
10060
10061         /*
10062          * Special case software events and allow them to be part of
10063          * any hardware group.
10064          */
10065         pmu = event->pmu;
10066
10067         if (attr.use_clockid) {
10068                 err = perf_event_set_clock(event, attr.clockid);
10069                 if (err)
10070                         goto err_alloc;
10071         }
10072
10073         if (pmu->task_ctx_nr == perf_sw_context)
10074                 event->event_caps |= PERF_EV_CAP_SOFTWARE;
10075
10076         if (group_leader &&
10077             (is_software_event(event) != is_software_event(group_leader))) {
10078                 if (is_software_event(event)) {
10079                         /*
10080                          * If event and group_leader are not both software
10081                          * events, and event is one, then the group leader is not.
10082                          *
10083                          * Allow the addition of software events to !software
10084                          * groups; this is safe because software events never
10085                          * fail to schedule.
10086                          */
10087                         pmu = group_leader->pmu;
10088                 } else if (is_software_event(group_leader) &&
10089                            (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
10090                         /*
10091                          * In case the group is a pure software group, and we
10092                          * try to add a hardware event, move the whole group to
10093                          * the hardware context.
10094                          */
10095                         move_group = 1;
10096                 }
10097         }
10098
10099         /*
10100          * Get the target context (task or percpu):
10101          */
10102         ctx = find_get_context(pmu, task, event);
10103         if (IS_ERR(ctx)) {
10104                 err = PTR_ERR(ctx);
10105                 goto err_alloc;
10106         }
10107
10108         if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
10109                 err = -EBUSY;
10110                 goto err_context;
10111         }
10112
10113         /*
10114          * Look up the group leader (we will attach this event to it):
10115          */
10116         if (group_leader) {
10117                 err = -EINVAL;
10118
10119                 /*
10120                  * Do not allow a recursive hierarchy (this new sibling
10121                  * becoming part of another group-sibling):
10122                  */
10123                 if (group_leader->group_leader != group_leader)
10124                         goto err_context;
10125
10126                 /* All events in a group should have the same clock */
10127                 if (group_leader->clock != event->clock)
10128                         goto err_context;
10129
10130                 /*
10131                  * Make sure both events are for the same CPU;
10132                  * grouping events for different CPUs is broken, since
10133                  * they can never be scheduled concurrently anyway.
10134                  */
10135                 if (group_leader->cpu != event->cpu)
10136                         goto err_context;
10137
10138                 /*
10139                  * Make sure both events are on the same task, or both
10140                  * are per-CPU events.
10141                  */
10142                 if (group_leader->ctx->task != ctx->task)
10143                         goto err_context;
10144
10145                 /*
10146                  * Do not allow attaching to a group in a different task
10147                  * or CPU context. If we're moving SW events, we'll fix
10148                  * this up later, so allow that.
10149                  */
10150                 if (!move_group && group_leader->ctx != ctx)
10151                         goto err_context;
10152
10153                 /*
10154                  * Only a group leader can be exclusive or pinned
10155                  */
10156                 if (attr.exclusive || attr.pinned)
10157                         goto err_context;
10158         }
10159
10160         if (output_event) {
10161                 err = perf_event_set_output(event, output_event);
10162                 if (err)
10163                         goto err_context;
10164         }
10165
10166         event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
10167                                         f_flags);
10168         if (IS_ERR(event_file)) {
10169                 err = PTR_ERR(event_file);
10170                 event_file = NULL;
10171                 goto err_context;
10172         }
10173
10174         if (move_group) {
10175                 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
10176
10177                 if (gctx->task == TASK_TOMBSTONE) {
10178                         err = -ESRCH;
10179                         goto err_locked;
10180                 }
10181
10182                 /*
10183                  * Check if we raced against another sys_perf_event_open() call
10184                  * moving the software group underneath us.
10185                  */
10186                 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
10187                         /*
10188                          * If someone moved the group out from under us, check
10189                          * if this new event wound up on the same ctx; if so,
10190                          * it's the regular !move_group case, otherwise fail.
10191                          */
10192                         if (gctx != ctx) {
10193                                 err = -EINVAL;
10194                                 goto err_locked;
10195                         } else {
10196                                 perf_event_ctx_unlock(group_leader, gctx);
10197                                 move_group = 0;
10198                         }
10199                 }
10200         } else {
10201                 mutex_lock(&ctx->mutex);
10202         }
10203
10204         if (ctx->task == TASK_TOMBSTONE) {
10205                 err = -ESRCH;
10206                 goto err_locked;
10207         }
10208
10209         if (!perf_event_validate_size(event)) {
10210                 err = -E2BIG;
10211                 goto err_locked;
10212         }
10213
10214         if (!task) {
10215                 /*
10216                  * Check if the @cpu we're creating an event for is online.
10217                  *
10218                  * We use the perf_cpu_context::ctx::mutex to serialize against
10219                  * the hotplug notifiers. See perf_event_{init,exit}_cpu().
10220                  */
10221                 struct perf_cpu_context *cpuctx =
10222                         container_of(ctx, struct perf_cpu_context, ctx);
10223
10224                 if (!cpuctx->online) {
10225                         err = -ENODEV;
10226                         goto err_locked;
10227                 }
10228         }
10229
10230
10231         /*
10232          * Must be under the same ctx::mutex as perf_install_in_context(),
10233          * because we need to serialize with concurrent event creation.
10234          */
10235         if (!exclusive_event_installable(event, ctx)) {
10236                 /* exclusive and group stuff are assumed mutually exclusive */
10237                 WARN_ON_ONCE(move_group);
10238
10239                 err = -EBUSY;
10240                 goto err_locked;
10241         }
10242
10243         WARN_ON_ONCE(ctx->parent_ctx);
10244
10245         /*
10246          * This is the point of no return; we cannot fail hereafter. This is
10247          * where we start modifying current state.
10248          */
10249
10250         if (move_group) {
10251                 /*
10252                  * See perf_event_ctx_lock() for comments on the details
10253                  * of swizzling perf_event::ctx.
10254                  */
10255                 perf_remove_from_context(group_leader, 0);
10256                 put_ctx(gctx);
10257
10258                 list_for_each_entry(sibling, &group_leader->sibling_list,
10259                                     group_entry) {
10260                         perf_remove_from_context(sibling, 0);
10261                         put_ctx(gctx);
10262                 }
10263
10264                 /*
10265                  * Wait for everybody to stop referencing the events through
10266                  * the old lists, before installing them on the new lists.
10267                  */
10268                 synchronize_rcu();
10269
10270                 /*
10271                  * Install the group siblings before the group leader.
10272                  *
10273                  * Because a group leader will try to install the entire group
10274                  * (through the sibling list, which is still intact), we could
10275                  * end up with siblings installed in the wrong context.
10276                  *
10277                  * Installing the siblings first is a NO-OP because they are not
10278                  * yet reachable through the group lists.
10279                  */
10280                 list_for_each_entry(sibling, &group_leader->sibling_list,
10281                                     group_entry) {
10282                         perf_event__state_init(sibling);
10283                         perf_install_in_context(ctx, sibling, sibling->cpu);
10284                         get_ctx(ctx);
10285                 }
10286
10287                 /*
10288                  * Removing from the context leaves the event disabled.
10289                  * What we want here is the event in its initial startup
10290                  * state, ready to be added into the new context.
10291                  */
10292                 perf_event__state_init(group_leader);
10293                 perf_install_in_context(ctx, group_leader, group_leader->cpu);
10294                 get_ctx(ctx);
10295         }
10296
10297         /*
10298          * Precalculate sample_data sizes; do while holding ctx::mutex such
10299          * that we're serialized against further additions and before
10300          * perf_install_in_context() which is the point the event is active and
10301          * can use these values.
10302          */
10303         perf_event__header_size(event);
10304         perf_event__id_header_size(event);
10305
10306         event->owner = current;
10307
10308         perf_install_in_context(ctx, event, event->cpu);
10309         perf_unpin_context(ctx);
10310
10311         if (move_group)
10312                 perf_event_ctx_unlock(group_leader, gctx);
10313         mutex_unlock(&ctx->mutex);
10314
10315         if (task) {
10316                 mutex_unlock(&task->signal->cred_guard_mutex);
10317                 put_task_struct(task);
10318         }
10319
10320         mutex_lock(&current->perf_event_mutex);
10321         list_add_tail(&event->owner_entry, &current->perf_event_list);
10322         mutex_unlock(&current->perf_event_mutex);
10323
10324         /*
10325          * Drop the reference on the group_event after placing the
10326          * new event on the sibling_list. This ensures destruction
10327          * of the group leader will find the pointer to itself in
10328          * perf_group_detach().
10329          */
10330         fdput(group);
10331         fd_install(event_fd, event_file);
10332         return event_fd;
10333
10334 err_locked:
10335         if (move_group)
10336                 perf_event_ctx_unlock(group_leader, gctx);
10337         mutex_unlock(&ctx->mutex);
10338 /* err_file: */
10339         fput(event_file);
10340 err_context:
10341         perf_unpin_context(ctx);
10342         put_ctx(ctx);
10343 err_alloc:
10344         /*
10345          * If event_file is set, the fput() above will have called ->release()
10346          * and that will take care of freeing the event.
10347          */
10348         if (!event_file)
10349                 free_event(event);
10350 err_cred:
10351         if (task)
10352                 mutex_unlock(&task->signal->cred_guard_mutex);
10353 err_task:
10354         if (task)
10355                 put_task_struct(task);
10356 err_group_fd:
10357         fdput(group);
10358 err_fd:
10359         put_unused_fd(event_fd);
10360         return err;
10361 }
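/*
 * A minimal user-space sketch of the syscall above (glibc provides no
 * wrapper, so it is invoked via syscall(2); attribute values are
 * illustrative only):
 *
 *	struct perf_event_attr attr;
 *	int fd;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.size           = sizeof(attr);
 *	attr.type           = PERF_TYPE_HARDWARE;
 *	attr.config         = PERF_COUNT_HW_CPU_CYCLES;
 *	attr.disabled       = 1;
 *	attr.exclude_kernel = 1;
 *
 *	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1,
 *		     PERF_FLAG_FD_CLOEXEC);
 *
 * with pid == 0 (the calling task), cpu == -1 (any CPU) and group_fd == -1
 * (no group), followed by ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) and a read()
 * on fd to collect the count.
 */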
10362
10363 /**
10364  * perf_event_create_kernel_counter
10365  *
10366  * @attr: attributes of the counter to create
10367  * @cpu: cpu in which the counter is bound
10368  * @task: task to profile (NULL for percpu)
10369  */
10370 struct perf_event *
10371 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
10372                                  struct task_struct *task,
10373                                  perf_overflow_handler_t overflow_handler,
10374                                  void *context)
10375 {
10376         struct perf_event_context *ctx;
10377         struct perf_event *event;
10378         int err;
10379
10380         /*
10381          * Get the target context (task or percpu):
10382          */
10383
10384         event = perf_event_alloc(attr, cpu, task, NULL, NULL,
10385                                  overflow_handler, context, -1);
10386         if (IS_ERR(event)) {
10387                 err = PTR_ERR(event);
10388                 goto err;
10389         }
10390
10391         /* Mark owner so we could distinguish it from user events. */
10392         event->owner = TASK_TOMBSTONE;
10393
10394         ctx = find_get_context(event->pmu, task, event);
10395         if (IS_ERR(ctx)) {
10396                 err = PTR_ERR(ctx);
10397                 goto err_free;
10398         }
10399
10400         WARN_ON_ONCE(ctx->parent_ctx);
10401         mutex_lock(&ctx->mutex);
10402         if (ctx->task == TASK_TOMBSTONE) {
10403                 err = -ESRCH;
10404                 goto err_unlock;
10405         }
10406
10407         if (!task) {
10408                 /*
10409                  * Check if the @cpu we're creating an event for is online.
10410                  *
10411                  * We use the perf_cpu_context::ctx::mutex to serialize against
10412                  * the hotplug notifiers. See perf_event_{init,exit}_cpu().
10413                  */
10414                 struct perf_cpu_context *cpuctx =
10415                         container_of(ctx, struct perf_cpu_context, ctx);
10416                 if (!cpuctx->online) {
10417                         err = -ENODEV;
10418                         goto err_unlock;
10419                 }
10420         }
10421
10422         if (!exclusive_event_installable(event, ctx)) {
10423                 err = -EBUSY;
10424                 goto err_unlock;
10425         }
10426
10427         perf_install_in_context(ctx, event, cpu);
10428         perf_unpin_context(ctx);
10429         mutex_unlock(&ctx->mutex);
10430
10431         return event;
10432
10433 err_unlock:
10434         mutex_unlock(&ctx->mutex);
10435         perf_unpin_context(ctx);
10436         put_ctx(ctx);
10437 err_free:
10438         free_event(event);
10439 err:
10440         return ERR_PTR(err);
10441 }
10442 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
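/*
 * A minimal sketch of an in-kernel user of the interface above (the names
 * my_overflow and cpu are illustrative, not taken from an existing caller):
 *
 *	static void my_overflow(struct perf_event *event,
 *				struct perf_sample_data *data,
 *				struct pt_regs *regs)
 *	{
 *		... react to the counter overflow ...
 *	}
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.size		= sizeof(attr),
 *		.sample_period	= 100000,
 *	};
 *	struct perf_event *ev;
 *
 *	ev = perf_event_create_kernel_counter(&attr, cpu, NULL,
 *					      my_overflow, NULL);
 *	if (IS_ERR(ev))
 *		return PTR_ERR(ev);
 *
 * and perf_event_release_kernel(ev) tears the counter down again.
 */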
10443
10444 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
10445 {
10446         struct perf_event_context *src_ctx;
10447         struct perf_event_context *dst_ctx;
10448         struct perf_event *event, *tmp;
10449         LIST_HEAD(events);
10450
10451         src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
10452         dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
10453
10454         /*
10455          * See perf_event_ctx_lock() for comments on the details
10456          * of swizzling perf_event::ctx.
10457          */
10458         mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
10459         list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
10460                                  event_entry) {
10461                 perf_remove_from_context(event, 0);
10462                 unaccount_event_cpu(event, src_cpu);
10463                 put_ctx(src_ctx);
10464                 list_add(&event->migrate_entry, &events);
10465         }
10466
10467         /*
10468          * Wait for the events to quiesce before re-instating them.
10469          */
10470         synchronize_rcu();
10471
10472         /*
10473          * Re-instate events in 2 passes.
10474          *
10475          * Skip over group leaders and only install siblings on this first
10476          * pass: siblings will not get enabled without a leader, whereas a
10477          * leader would enable its siblings, even if those are still on the old
10478          * context.
10479          */
10480         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10481                 if (event->group_leader == event)
10482                         continue;
10483
10484                 list_del(&event->migrate_entry);
10485                 if (event->state >= PERF_EVENT_STATE_OFF)
10486                         event->state = PERF_EVENT_STATE_INACTIVE;
10487                 account_event_cpu(event, dst_cpu);
10488                 perf_install_in_context(dst_ctx, event, dst_cpu);
10489                 get_ctx(dst_ctx);
10490         }
10491
10492         /*
10493          * Once all the siblings are setup properly, install the group leaders
10494          * to make it go.
10495          */
10496         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10497                 list_del(&event->migrate_entry);
10498                 if (event->state >= PERF_EVENT_STATE_OFF)
10499                         event->state = PERF_EVENT_STATE_INACTIVE;
10500                 account_event_cpu(event, dst_cpu);
10501                 perf_install_in_context(dst_ctx, event, dst_cpu);
10502                 get_ctx(dst_ctx);
10503         }
10504         mutex_unlock(&dst_ctx->mutex);
10505         mutex_unlock(&src_ctx->mutex);
10506 }
10507 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
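/*
 * perf_pmu_migrate_context() is intended for PMUs whose events are tied to a
 * single "owner" CPU (uncore-style PMUs): on hotplug the driver moves the
 * whole per-CPU context to a surviving CPU. Siblings are reinstated first
 * (they stay idle without a leader) and leaders last, so a leader never
 * enables siblings that still sit on the old context.
 */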
10508
10509 static void sync_child_event(struct perf_event *child_event,
10510                                struct task_struct *child)
10511 {
10512         struct perf_event *parent_event = child_event->parent;
10513         u64 child_val;
10514
10515         if (child_event->attr.inherit_stat)
10516                 perf_event_read_event(child_event, child);
10517
10518         child_val = perf_event_count(child_event);
10519
10520         /*
10521          * Add back the child's count to the parent's count:
10522          */
10523         atomic64_add(child_val, &parent_event->child_count);
10524         atomic64_add(child_event->total_time_enabled,
10525                      &parent_event->child_total_time_enabled);
10526         atomic64_add(child_event->total_time_running,
10527                      &parent_event->child_total_time_running);
10528 }
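/*
 * The child's count and times are folded into the parent's child_count and
 * child_total_time_* fields here; perf_event_count() and the read() paths
 * add those to the parent's own values, so user space continues to see the
 * aggregate after the child exits.
 */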
10529
10530 static void
10531 perf_event_exit_event(struct perf_event *child_event,
10532                       struct perf_event_context *child_ctx,
10533                       struct task_struct *child)
10534 {
10535         struct perf_event *parent_event = child_event->parent;
10536
10537         /*
10538          * Do not destroy the 'original' grouping; because of the context
10539          * switch optimization the original events could've ended up in a
10540          * random child task.
10541          *
10542          * If we were to destroy the original group, all group related
10543          * operations would cease to function properly after this random
10544          * child dies.
10545          *
10546          * Do destroy all inherited groups; we don't care about those
10547          * and being thorough is better.
10548          */
10549         raw_spin_lock_irq(&child_ctx->lock);
10550         WARN_ON_ONCE(child_ctx->is_active);
10551
10552         if (parent_event)
10553                 perf_group_detach(child_event);
10554         list_del_event(child_event, child_ctx);
10555         child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
10556         raw_spin_unlock_irq(&child_ctx->lock);
10557
10558         /*
10559          * Parent events are governed by their filedesc, retain them.
10560          */
10561         if (!parent_event) {
10562                 perf_event_wakeup(child_event);
10563                 return;
10564         }
10565         /*
10566          * Child events can be cleaned up.
10567          */
10568
10569         sync_child_event(child_event, child);
10570
10571         /*
10572          * Remove this event from the parent's list
10573          */
10574         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
10575         mutex_lock(&parent_event->child_mutex);
10576         list_del_init(&child_event->child_list);
10577         mutex_unlock(&parent_event->child_mutex);
10578
10579         /*
10580          * Kick perf_poll() for is_event_hup().
10581          */
10582         perf_event_wakeup(parent_event);
10583         free_event(child_event);
10584         put_event(parent_event);
10585 }
10586
10587 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
10588 {
10589         struct perf_event_context *child_ctx, *clone_ctx = NULL;
10590         struct perf_event *child_event, *next;
10591
10592         WARN_ON_ONCE(child != current);
10593
10594         child_ctx = perf_pin_task_context(child, ctxn);
10595         if (!child_ctx)
10596                 return;
10597
10598         /*
10599          * In order to reduce the amount of trickery in ctx tear-down, we hold
10600          * ctx::mutex over the entire thing. This serializes against almost
10601          * everything that wants to access the ctx.
10602          *
10603          * The exception is sys_perf_event_open() /
10604          * perf_event_create_kernel_counter() which does find_get_context()
10605          * without ctx::mutex (it cannot because of the move_group double mutex
10606          * lock thing). See the comments in perf_install_in_context().
10607          */
10608         mutex_lock(&child_ctx->mutex);
10609
10610         /*
10611          * In a single ctx::lock section, de-schedule the events and detach the
10612          * context from the task such that we cannot ever get it scheduled back
10613          * in.
10614          */
10615         raw_spin_lock_irq(&child_ctx->lock);
10616         task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
10617
10618         /*
10619          * Now that the context is inactive, destroy the task <-> ctx relation
10620          * and mark the context dead.
10621          */
10622         RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
10623         put_ctx(child_ctx); /* cannot be last */
10624         WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
10625         put_task_struct(current); /* cannot be last */
10626
10627         clone_ctx = unclone_ctx(child_ctx);
10628         raw_spin_unlock_irq(&child_ctx->lock);
10629
10630         if (clone_ctx)
10631                 put_ctx(clone_ctx);
10632
10633         /*
10634          * Report the task dead after unscheduling the events so that we
10635          * won't get any samples after PERF_RECORD_EXIT. We can however still
10636          * get a few PERF_RECORD_READ events.
10637          */
10638         perf_event_task(child, child_ctx, 0);
10639
10640         list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
10641                 perf_event_exit_event(child_event, child_ctx, child);
10642
10643         mutex_unlock(&child_ctx->mutex);
10644
10645         put_ctx(child_ctx);
10646 }
10647
10648 /*
10649  * When a child task exits, feed back event values to parent events.
10650  *
10651  * Can be called with cred_guard_mutex held when called from
10652  * install_exec_creds().
10653  */
10654 void perf_event_exit_task(struct task_struct *child)
10655 {
10656         struct perf_event *event, *tmp;
10657         int ctxn;
10658
10659         mutex_lock(&child->perf_event_mutex);
10660         list_for_each_entry_safe(event, tmp, &child->perf_event_list,
10661                                  owner_entry) {
10662                 list_del_init(&event->owner_entry);
10663
10664                 /*
10665                  * Ensure the list deletion is visible before we clear
10666                  * the owner; this closes a race against perf_release() where
10667                  * we need to serialize on the owner->perf_event_mutex.
10668                  */
10669                 smp_store_release(&event->owner, NULL);
10670         }
10671         mutex_unlock(&child->perf_event_mutex);
10672
10673         for_each_task_context_nr(ctxn)
10674                 perf_event_exit_task_context(child, ctxn);
10675
10676         /*
10677          * The perf_event_exit_task_context calls perf_event_task
10678          * with child's task_ctx, which generates EXIT events for
10679          * child contexts and sets child->perf_event_ctxp[] to NULL.
10680          * At this point we need to send EXIT events to cpu contexts.
10681          */
10682         perf_event_task(child, NULL, 0);
10683 }
10684
10685 static void perf_free_event(struct perf_event *event,
10686                             struct perf_event_context *ctx)
10687 {
10688         struct perf_event *parent = event->parent;
10689
10690         if (WARN_ON_ONCE(!parent))
10691                 return;
10692
10693         mutex_lock(&parent->child_mutex);
10694         list_del_init(&event->child_list);
10695         mutex_unlock(&parent->child_mutex);
10696
10697         put_event(parent);
10698
10699         raw_spin_lock_irq(&ctx->lock);
10700         perf_group_detach(event);
10701         list_del_event(event, ctx);
10702         raw_spin_unlock_irq(&ctx->lock);
10703         free_event(event);
10704 }
10705
10706 /*
10707  * Free an unexposed, unused context as created by inheritance by
10708  * perf_event_init_task() below; used by fork() in case of failure.
10709  *
10710  * Not all locks are strictly required, but take them anyway to be nice and
10711  * help out with the lockdep assertions.
10712  */
10713 void perf_event_free_task(struct task_struct *task)
10714 {
10715         struct perf_event_context *ctx;
10716         struct perf_event *event, *tmp;
10717         int ctxn;
10718
10719         for_each_task_context_nr(ctxn) {
10720                 ctx = task->perf_event_ctxp[ctxn];
10721                 if (!ctx)
10722                         continue;
10723
10724                 mutex_lock(&ctx->mutex);
10725                 raw_spin_lock_irq(&ctx->lock);
10726                 /*
10727                  * Destroy the task <-> ctx relation and mark the context dead.
10728                  *
10729                  * This is important because even though the task hasn't been
10730                  * exposed yet the context has been (through child_list).
10731                  */
10732                 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
10733                 WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
10734                 put_task_struct(task); /* cannot be last */
10735                 raw_spin_unlock_irq(&ctx->lock);
10736
10737                 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
10738                         perf_free_event(event, ctx);
10739
10740                 mutex_unlock(&ctx->mutex);
10741                 put_ctx(ctx);
10742         }
10743 }
10744
10745 void perf_event_delayed_put(struct task_struct *task)
10746 {
10747         int ctxn;
10748
10749         for_each_task_context_nr(ctxn)
10750                 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
10751 }
10752
10753 struct file *perf_event_get(unsigned int fd)
10754 {
10755         struct file *file;
10756
10757         file = fget_raw(fd);
10758         if (!file)
10759                 return ERR_PTR(-EBADF);
10760
10761         if (file->f_op != &perf_fops) {
10762                 fput(file);
10763                 return ERR_PTR(-EBADF);
10764         }
10765
10766         return file;
10767 }
10768
10769 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
10770 {
10771         if (!event)
10772                 return ERR_PTR(-EINVAL);
10773
10774         return &event->attr;
10775 }
10776
10777 /*
10778  * Inherit an event from the parent task to the child task.
10779  *
10780  * Returns:
10781  *  - valid pointer on success
10782  *  - NULL for orphaned events
10783  *  - IS_ERR() on error
10784  */
10785 static struct perf_event *
10786 inherit_event(struct perf_event *parent_event,
10787               struct task_struct *parent,
10788               struct perf_event_context *parent_ctx,
10789               struct task_struct *child,
10790               struct perf_event *group_leader,
10791               struct perf_event_context *child_ctx)
10792 {
10793         enum perf_event_active_state parent_state = parent_event->state;
10794         struct perf_event *child_event;
10795         unsigned long flags;
10796
10797         /*
10798          * Instead of creating recursive hierarchies of events,
10799          * we link inherited events back to the original parent,
10800          * which has a filp for sure, which we use as the reference
10801          * count:
10802          */
10803         if (parent_event->parent)
10804                 parent_event = parent_event->parent;
10805
10806         child_event = perf_event_alloc(&parent_event->attr,
10807                                            parent_event->cpu,
10808                                            child,
10809                                            group_leader, parent_event,
10810                                            NULL, NULL, -1);
10811         if (IS_ERR(child_event))
10812                 return child_event;
10813
10814         /*
10815          * is_orphaned_event() and list_add_tail(&parent_event->child_list)
10816          * must be under the same lock in order to serialize against
10817          * perf_event_release_kernel(), such that either we must observe
10818          * is_orphaned_event() or they will observe us on the child_list.
10819          */
10820         mutex_lock(&parent_event->child_mutex);
10821         if (is_orphaned_event(parent_event) ||
10822             !atomic_long_inc_not_zero(&parent_event->refcount)) {
10823                 mutex_unlock(&parent_event->child_mutex);
10824                 free_event(child_event);
10825                 return NULL;
10826         }
10827
10828         get_ctx(child_ctx);
10829
10830         /*
10831          * Make the child state follow the state of the parent event,
10832          * not its attr.disabled bit.  We hold the parent's mutex,
10833          * so we won't race with perf_event_{en, dis}able_family.
10834          */
10835         if (parent_state >= PERF_EVENT_STATE_INACTIVE)
10836                 child_event->state = PERF_EVENT_STATE_INACTIVE;
10837         else
10838                 child_event->state = PERF_EVENT_STATE_OFF;
10839
10840         if (parent_event->attr.freq) {
10841                 u64 sample_period = parent_event->hw.sample_period;
10842                 struct hw_perf_event *hwc = &child_event->hw;
10843
10844                 hwc->sample_period = sample_period;
10845                 hwc->last_period   = sample_period;
10846
10847                 local64_set(&hwc->period_left, sample_period);
10848         }
10849
10850         child_event->ctx = child_ctx;
10851         child_event->overflow_handler = parent_event->overflow_handler;
10852         child_event->overflow_handler_context
10853                 = parent_event->overflow_handler_context;
10854
10855         /*
10856          * Precalculate sample_data sizes
10857          */
10858         perf_event__header_size(child_event);
10859         perf_event__id_header_size(child_event);
10860
10861         /*
10862          * Link it up in the child's context:
10863          */
10864         raw_spin_lock_irqsave(&child_ctx->lock, flags);
10865         add_event_to_ctx(child_event, child_ctx);
10866         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
10867
10868         /*
10869          * Link this into the parent event's child list
10870          */
10871         list_add_tail(&child_event->child_list, &parent_event->child_list);
10872         mutex_unlock(&parent_event->child_mutex);
10873
10874         return child_event;
10875 }
10876
10877 /*
10878  * Inherits an event group.
10879  *
10880  * This will quietly suppress orphaned events; !inherit_event() is not an error.
10881  * This matches with perf_event_release_kernel() removing all child events.
10882  *
10883  * Returns:
10884  *  - 0 on success
10885  *  - <0 on error
10886  */
10887 static int inherit_group(struct perf_event *parent_event,
10888               struct task_struct *parent,
10889               struct perf_event_context *parent_ctx,
10890               struct task_struct *child,
10891               struct perf_event_context *child_ctx)
10892 {
10893         struct perf_event *leader;
10894         struct perf_event *sub;
10895         struct perf_event *child_ctr;
10896
10897         leader = inherit_event(parent_event, parent, parent_ctx,
10898                                  child, NULL, child_ctx);
10899         if (IS_ERR(leader))
10900                 return PTR_ERR(leader);
10901         /*
10902          * @leader can be NULL here because of is_orphaned_event(). In this
10903          * case inherit_event() will create individual events, similar to what
10904          * perf_group_detach() would do anyway.
10905          */
10906         list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
10907                 child_ctr = inherit_event(sub, parent, parent_ctx,
10908                                             child, leader, child_ctx);
10909                 if (IS_ERR(child_ctr))
10910                         return PTR_ERR(child_ctr);
10911         }
10912         return 0;
10913 }
10914
10915 /*
10916  * Creates the child task context and tries to inherit the event-group.
10917  *
10918  * Clears @inherited_all on !attr.inherit or error. Note that we'll leave
10919  * inherited_all set when we 'fail' to inherit an orphaned event; this is
10920  * consistent with perf_event_release_kernel() removing all child events.
10921  *
10922  * Returns:
10923  *  - 0 on success
10924  *  - <0 on error
10925  */
10926 static int
10927 inherit_task_group(struct perf_event *event, struct task_struct *parent,
10928                    struct perf_event_context *parent_ctx,
10929                    struct task_struct *child, int ctxn,
10930                    int *inherited_all)
10931 {
10932         int ret;
10933         struct perf_event_context *child_ctx;
10934
10935         if (!event->attr.inherit) {
10936                 *inherited_all = 0;
10937                 return 0;
10938         }
10939
10940         child_ctx = child->perf_event_ctxp[ctxn];
10941         if (!child_ctx) {
10942                 /*
10943                  * This is executed from the parent task context, so
10944                  * inherit events that have been marked for cloning.
10945                  * First allocate and initialize a context for the
10946                  * child.
10947                  */
10948                 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
10949                 if (!child_ctx)
10950                         return -ENOMEM;
10951
10952                 child->perf_event_ctxp[ctxn] = child_ctx;
10953         }
10954
10955         ret = inherit_group(event, parent, parent_ctx,
10956                             child, child_ctx);
10957
10958         if (ret)
10959                 *inherited_all = 0;
10960
10961         return ret;
10962 }
10963
10964 /*
10965  * Initialize the perf_event context in task_struct
10966  */
10967 static int perf_event_init_context(struct task_struct *child, int ctxn)
10968 {
10969         struct perf_event_context *child_ctx, *parent_ctx;
10970         struct perf_event_context *cloned_ctx;
10971         struct perf_event *event;
10972         struct task_struct *parent = current;
10973         int inherited_all = 1;
10974         unsigned long flags;
10975         int ret = 0;
10976
10977         if (likely(!parent->perf_event_ctxp[ctxn]))
10978                 return 0;
10979
10980         /*
10981          * If the parent's context is a clone, pin it so it won't get
10982          * swapped under us.
10983          */
10984         parent_ctx = perf_pin_task_context(parent, ctxn);
10985         if (!parent_ctx)
10986                 return 0;
10987
10988         /*
10989          * No need to check if parent_ctx != NULL here; since we saw
10990          * it non-NULL earlier, the only reason for it to become NULL
10991          * is if we exit, and since we're currently in the middle of
10992          * a fork we can't be exiting at the same time.
10993          */
10994
10995         /*
10996          * Lock the parent list. No need to lock the child - not PID
10997          * hashed yet and not running, so nobody can access it.
10998          */
10999         mutex_lock(&parent_ctx->mutex);
11000
11001         /*
11002          * We dont have to disable NMIs - we are only looking at
11003          * the list, not manipulating it:
11004          */
11005         list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
11006                 ret = inherit_task_group(event, parent, parent_ctx,
11007                                          child, ctxn, &inherited_all);
11008                 if (ret)
11009                         goto out_unlock;
11010         }
11011
11012         /*
11013          * We can't hold ctx->lock when iterating the ->flexible_groups list due
11014          * to allocations, but we need to prevent rotation because
11015          * rotate_ctx() will change the list from interrupt context.
11016          */
11017         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
11018         parent_ctx->rotate_disable = 1;
11019         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
11020
11021         list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
11022                 ret = inherit_task_group(event, parent, parent_ctx,
11023                                          child, ctxn, &inherited_all);
11024                 if (ret)
11025                         goto out_unlock;
11026         }
11027
11028         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
11029         parent_ctx->rotate_disable = 0;
11030
11031         child_ctx = child->perf_event_ctxp[ctxn];
11032
11033         if (child_ctx && inherited_all) {
11034                 /*
11035                  * Mark the child context as a clone of the parent
11036                  * context, or of whatever the parent is a clone of.
11037                  *
11038                  * Note that if the parent is a clone, the holding of
11039                  * parent_ctx->lock prevents it from being uncloned.
11040                  */
11041                 cloned_ctx = parent_ctx->parent_ctx;
11042                 if (cloned_ctx) {
11043                         child_ctx->parent_ctx = cloned_ctx;
11044                         child_ctx->parent_gen = parent_ctx->parent_gen;
11045                 } else {
11046                         child_ctx->parent_ctx = parent_ctx;
11047                         child_ctx->parent_gen = parent_ctx->generation;
11048                 }
11049                 get_ctx(child_ctx->parent_ctx);
11050         }
11051
11052         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
11053 out_unlock:
11054         mutex_unlock(&parent_ctx->mutex);
11055
11056         perf_unpin_context(parent_ctx);
11057         put_ctx(parent_ctx);
11058
11059         return ret;
11060 }
11061
11062 /*
11063  * Initialize the perf_event context in task_struct
11064  */
11065 int perf_event_init_task(struct task_struct *child)
11066 {
11067         int ctxn, ret;
11068
11069         memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
11070         mutex_init(&child->perf_event_mutex);
11071         INIT_LIST_HEAD(&child->perf_event_list);
11072
11073         for_each_task_context_nr(ctxn) {
11074                 ret = perf_event_init_context(child, ctxn);
11075                 if (ret) {
11076                         perf_event_free_task(child);
11077                         return ret;
11078                 }
11079         }
11080
11081         return 0;
11082 }
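
/*
 * Call-site note (hedged, for orientation only): perf_event_init_task() is
 * invoked from the fork path in kernel/fork.c:copy_process(), and a failure
 * later in copy_process() tears the child contexts down again via
 * perf_event_free_task().  Sketch, not verbatim kernel/fork.c code:
 *
 *	retval = perf_event_init_task(p);
 *	if (retval)
 *		goto bad_fork_cleanup_policy;
 *	...
 * bad_fork_cleanup_perf:
 *	perf_event_free_task(p);
 */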
11083
11084 static void __init perf_event_init_all_cpus(void)
11085 {
11086         struct swevent_htable *swhash;
11087         int cpu;
11088
11089         zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
11090
11091         for_each_possible_cpu(cpu) {
11092                 swhash = &per_cpu(swevent_htable, cpu);
11093                 mutex_init(&swhash->hlist_mutex);
11094                 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
11095
11096                 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
11097                 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
11098
11099 #ifdef CONFIG_CGROUP_PERF
11100                 INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
11101 #endif
11102                 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
11103         }
11104 }
11105
11106 void perf_swevent_init_cpu(unsigned int cpu)
11107 {
11108         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
11109
11110         mutex_lock(&swhash->hlist_mutex);
11111         if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
11112                 struct swevent_hlist *hlist;
11113
11114                 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
11115                 WARN_ON(!hlist);
11116                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
11117         }
11118         mutex_unlock(&swhash->hlist_mutex);
11119 }
11120
11121 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
11122 static void __perf_event_exit_context(void *__info)
11123 {
11124         struct perf_event_context *ctx = __info;
11125         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
11126         struct perf_event *event;
11127
11128         raw_spin_lock(&ctx->lock);
11129         list_for_each_entry(event, &ctx->event_list, event_entry)
11130                 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
11131         raw_spin_unlock(&ctx->lock);
11132 }
11133
11134 static void perf_event_exit_cpu_context(int cpu)
11135 {
11136         struct perf_cpu_context *cpuctx;
11137         struct perf_event_context *ctx;
11138         struct pmu *pmu;
11139
11140         mutex_lock(&pmus_lock);
11141         list_for_each_entry(pmu, &pmus, entry) {
11142                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11143                 ctx = &cpuctx->ctx;
11144
11145                 mutex_lock(&ctx->mutex);
11146                 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
11147                 cpuctx->online = 0;
11148                 mutex_unlock(&ctx->mutex);
11149         }
11150         cpumask_clear_cpu(cpu, perf_online_mask);
11151         mutex_unlock(&pmus_lock);
11152 }
11153 #else
11154
11155 static void perf_event_exit_cpu_context(int cpu) { }
11156
11157 #endif
11158
11159 int perf_event_init_cpu(unsigned int cpu)
11160 {
11161         struct perf_cpu_context *cpuctx;
11162         struct perf_event_context *ctx;
11163         struct pmu *pmu;
11164
11165         perf_swevent_init_cpu(cpu);
11166
11167         mutex_lock(&pmus_lock);
11168         cpumask_set_cpu(cpu, perf_online_mask);
11169         list_for_each_entry(pmu, &pmus, entry) {
11170                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11171                 ctx = &cpuctx->ctx;
11172
11173                 mutex_lock(&ctx->mutex);
11174                 cpuctx->online = 1;
11175                 mutex_unlock(&ctx->mutex);
11176         }
11177         mutex_unlock(&pmus_lock);
11178
11179         return 0;
11180 }
11181
11182 int perf_event_exit_cpu(unsigned int cpu)
11183 {
11184         perf_event_exit_cpu_context(cpu);
11185         return 0;
11186 }
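
/*
 * Hotplug wiring note (hedged): perf_event_init_cpu()/perf_event_exit_cpu()
 * are hooked into the CPU hotplug state machine by the core hotplug code,
 * not registered from this file.  A driver that wanted the same
 * online/offline pairing through the dynamic API would do roughly the
 * following (the state name and callbacks are made up for illustration):
 *
 *	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "perf/example:online",
 *				example_online_cb, example_offline_cb);
 *	if (ret < 0)
 *		pr_err("cpuhp_setup_state failed: %d\n", ret);
 */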
11187
11188 static int
11189 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
11190 {
11191         int cpu;
11192
11193         for_each_online_cpu(cpu)
11194                 perf_event_exit_cpu(cpu);
11195
11196         return NOTIFY_OK;
11197 }
11198
11199 /*
11200  * Run the perf reboot notifier at the very last possible moment so that
11201  * the generic watchdog code runs as long as possible.
11202  */
11203 static struct notifier_block perf_reboot_notifier = {
11204         .notifier_call = perf_reboot,
11205         .priority = INT_MIN,
11206 };
11207
11208 void __init perf_event_init(void)
11209 {
11210         int ret;
11211
11212         idr_init(&pmu_idr);
11213
11214         perf_event_init_all_cpus();
11215         init_srcu_struct(&pmus_srcu);
11216         perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
11217         perf_pmu_register(&perf_cpu_clock, NULL, -1);
11218         perf_pmu_register(&perf_task_clock, NULL, -1);
11219         perf_tp_register();
11220         perf_event_init_cpu(smp_processor_id());
11221         register_reboot_notifier(&perf_reboot_notifier);
11222
11223         ret = init_hw_breakpoint();
11224         WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
11225
11226         /*
11227          * Build-time assertion that we keep the data_head at the intended
11228          * location.  IOW, validate that we got the __reserved[] size right.
11229          */
11230         BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
11231                      != 1024);
11232 }
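
/*
 * Why the 1024-byte assertion above matters (hedged sketch): the first page
 * of a perf mmap() is struct perf_event_mmap_page, which user space reads
 * directly, so data_head's offset is ABI.  A user-space reader typically
 * does something like the following, with page_size from
 * sysconf(_SC_PAGESIZE); error handling and the exact barrier helper are
 * tool-specific assumptions:
 *
 *	struct perf_event_mmap_page *pc;
 *
 *	pc = mmap(NULL, (1 + 8) * page_size, PROT_READ | PROT_WRITE,
 *		  MAP_SHARED, perf_fd, 0);
 *	head = pc->data_head;
 *	rmb();	// pairs with the kernel's barrier before data_head is updated
 */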
11233
11234 ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
11235                               char *page)
11236 {
11237         struct perf_pmu_events_attr *pmu_attr =
11238                 container_of(attr, struct perf_pmu_events_attr, attr);
11239
11240         if (pmu_attr->event_str)
11241                 return sprintf(page, "%s\n", pmu_attr->event_str);
11242
11243         return 0;
11244 }
11245 EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
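
/*
 * Typical consumer (hedged example): PMU drivers point their sysfs "events"
 * attributes at perf_event_sysfs_show(), usually through the
 * PMU_EVENT_ATTR_STRING() helper, so that the event string declared in the
 * driver is what appears under /sys/bus/event_source/devices/<pmu>/events/.
 * The event name and encoding below are made up:
 *
 *	PMU_EVENT_ATTR_STRING(example-cycles, example_attr_cycles, "event=0x3c");
 *
 *	static struct attribute *example_events_attrs[] = {
 *		&example_attr_cycles.attr.attr,
 *		NULL,
 *	};
 */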
11246
11247 static int __init perf_event_sysfs_init(void)
11248 {
11249         struct pmu *pmu;
11250         int ret;
11251
11252         mutex_lock(&pmus_lock);
11253
11254         ret = bus_register(&pmu_bus);
11255         if (ret)
11256                 goto unlock;
11257
11258         list_for_each_entry(pmu, &pmus, entry) {
11259                 if (!pmu->name || pmu->type < 0)
11260                         continue;
11261
11262                 ret = pmu_dev_alloc(pmu);
11263                 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
11264         }
11265         pmu_bus_running = 1;
11266         ret = 0;
11267
11268 unlock:
11269         mutex_unlock(&pmus_lock);
11270
11271         return ret;
11272 }
11273 device_initcall(perf_event_sysfs_init);
11274
11275 #ifdef CONFIG_CGROUP_PERF
11276 static struct cgroup_subsys_state *
11277 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
11278 {
11279         struct perf_cgroup *jc;
11280
11281         jc = kzalloc(sizeof(*jc), GFP_KERNEL);
11282         if (!jc)
11283                 return ERR_PTR(-ENOMEM);
11284
11285         jc->info = alloc_percpu(struct perf_cgroup_info);
11286         if (!jc->info) {
11287                 kfree(jc);
11288                 return ERR_PTR(-ENOMEM);
11289         }
11290
11291         return &jc->css;
11292 }
11293
11294 static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
11295 {
11296         struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
11297
11298         free_percpu(jc->info);
11299         kfree(jc);
11300 }
11301
11302 static int __perf_cgroup_move(void *info)
11303 {
11304         struct task_struct *task = info;
11305         rcu_read_lock();
11306         perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
11307         rcu_read_unlock();
11308         return 0;
11309 }
11310
11311 static void perf_cgroup_attach(struct cgroup_taskset *tset)
11312 {
11313         struct task_struct *task;
11314         struct cgroup_subsys_state *css;
11315
11316         cgroup_taskset_for_each(task, css, tset)
11317                 task_function_call(task, __perf_cgroup_move, task);
11318 }
11319
11320 struct cgroup_subsys perf_event_cgrp_subsys = {
11321         .css_alloc      = perf_cgroup_css_alloc,
11322         .css_free       = perf_cgroup_css_free,
11323         .attach         = perf_cgroup_attach,
11324         /*
11325          * Implicitly enable on dfl hierarchy so that perf events can
11326          * always be filtered by cgroup2 path, as long as the perf_event
11327          * controller is not mounted on a legacy hierarchy.
11328          */
11329         .implicit_on_dfl = true,
11330         .threaded       = true,
11331 };
11332 #endif /* CONFIG_CGROUP_PERF */
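
/*
 * User-space counterpart (hedged illustration): the perf_event controller
 * above is what lets a tool scope counting to one cgroup by passing a
 * cgroup directory fd as the "pid" argument of perf_event_open(); cgroup
 * events must also name a CPU.  Paths and values below are examples only,
 * and error handling is omitted:
 *
 *	struct perf_event_attr attr = {
 *		.type	= PERF_TYPE_HARDWARE,
 *		.size	= sizeof(attr),
 *		.config	= PERF_COUNT_HW_CPU_CYCLES,
 *	};
 *	int cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
 *	int fd = syscall(__NR_perf_event_open, &attr, cgrp_fd,
 *			 0, -1, PERF_FLAG_PID_CGROUP);	// cpu 0, no group leader
 */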