perf_counter: software counter event infrastructure
[platform/kernel/linux-starfive.git] / kernel / perf_counter.c
1 /*
2  * Performance counter core code
3  *
4  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6  *
7  *  For licencing details see kernel-base/COPYING
8  */
9
10 #include <linux/fs.h>
11 #include <linux/cpu.h>
12 #include <linux/smp.h>
13 #include <linux/file.h>
14 #include <linux/poll.h>
15 #include <linux/sysfs.h>
16 #include <linux/ptrace.h>
17 #include <linux/percpu.h>
18 #include <linux/uaccess.h>
19 #include <linux/syscalls.h>
20 #include <linux/anon_inodes.h>
21 #include <linux/kernel_stat.h>
22 #include <linux/perf_counter.h>
23 #include <linux/mm.h>
24 #include <linux/vmstat.h>
25
26 /*
27  * Each CPU has a list of per CPU counters:
28  */
29 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
30
31 int perf_max_counters __read_mostly = 1;
32 static int perf_reserved_percpu __read_mostly;
33 static int perf_overcommit __read_mostly = 1;
34
35 /*
36  * Mutex for (sysadmin-configurable) counter reservations:
37  */
38 static DEFINE_MUTEX(perf_resource_mutex);
39
40 /*
41  * Architecture provided APIs - weak aliases:
42  */
43 extern __weak const struct hw_perf_counter_ops *
44 hw_perf_counter_init(struct perf_counter *counter)
45 {
46         return NULL;
47 }
48
49 u64 __weak hw_perf_save_disable(void)           { return 0; }
50 void __weak hw_perf_restore(u64 ctrl)           { barrier(); }
51 void __weak hw_perf_counter_setup(int cpu)      { barrier(); }
52 int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
53                struct perf_cpu_context *cpuctx,
54                struct perf_counter_context *ctx, int cpu)
55 {
56         return 0;
57 }
58
59 void __weak perf_counter_print_debug(void)      { }
60
61 static void
62 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
63 {
64         struct perf_counter *group_leader = counter->group_leader;
65
66         /*
67          * Depending on whether it is a standalone or sibling counter,
68          * add it straight to the context's counter list, or to the group
69          * leader's sibling list:
70          */
71         if (counter->group_leader == counter)
72                 list_add_tail(&counter->list_entry, &ctx->counter_list);
73         else
74                 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
75 }
76
77 static void
78 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
79 {
80         struct perf_counter *sibling, *tmp;
81
82         list_del_init(&counter->list_entry);
83
84         /*
85          * If this was a group counter with sibling counters then
86          * upgrade the siblings to singleton counters by adding them
87          * to the context list directly:
88          */
89         list_for_each_entry_safe(sibling, tmp,
90                                  &counter->sibling_list, list_entry) {
91
92                 list_move_tail(&sibling->list_entry, &ctx->counter_list);
93                 sibling->group_leader = sibling;
94         }
95 }
96
97 static void
98 counter_sched_out(struct perf_counter *counter,
99                   struct perf_cpu_context *cpuctx,
100                   struct perf_counter_context *ctx)
101 {
102         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
103                 return;
104
105         counter->state = PERF_COUNTER_STATE_INACTIVE;
106         counter->hw_ops->disable(counter);
107         counter->oncpu = -1;
108
109         if (!is_software_counter(counter))
110                 cpuctx->active_oncpu--;
111         ctx->nr_active--;
112         if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
113                 cpuctx->exclusive = 0;
114 }
115
116 static void
117 group_sched_out(struct perf_counter *group_counter,
118                 struct perf_cpu_context *cpuctx,
119                 struct perf_counter_context *ctx)
120 {
121         struct perf_counter *counter;
122
123         if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
124                 return;
125
126         counter_sched_out(group_counter, cpuctx, ctx);
127
128         /*
129          * Schedule out siblings (if any):
130          */
131         list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
132                 counter_sched_out(counter, cpuctx, ctx);
133
134         if (group_counter->hw_event.exclusive)
135                 cpuctx->exclusive = 0;
136 }
137
138 /*
139  * Cross CPU call to remove a performance counter
140  *
141  * We disable the counter on the hardware level first. After that we
142  * remove it from the context list.
143  */
144 static void __perf_counter_remove_from_context(void *info)
145 {
146         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
147         struct perf_counter *counter = info;
148         struct perf_counter_context *ctx = counter->ctx;
149         unsigned long flags;
150         u64 perf_flags;
151
152         /*
153          * If this is a task context, we need to check whether it is
154          * the current task context of this cpu. If not it has been
155          * scheduled out before the smp call arrived.
156          */
157         if (ctx->task && cpuctx->task_ctx != ctx)
158                 return;
159
160         curr_rq_lock_irq_save(&flags);
161         spin_lock(&ctx->lock);
162
163         counter_sched_out(counter, cpuctx, ctx);
164
165         counter->task = NULL;
166         ctx->nr_counters--;
167
168         /*
169          * Protect the list operation against NMI by disabling the
170          * counters on a global level. NOP for non NMI based counters.
171          */
172         perf_flags = hw_perf_save_disable();
173         list_del_counter(counter, ctx);
174         hw_perf_restore(perf_flags);
175
176         if (!ctx->task) {
177                 /*
178                  * Allow more per task counters with respect to the
179                  * reservation:
180                  */
181                 cpuctx->max_pertask =
182                         min(perf_max_counters - ctx->nr_counters,
183                             perf_max_counters - perf_reserved_percpu);
184         }
185
186         spin_unlock(&ctx->lock);
187         curr_rq_unlock_irq_restore(&flags);
188 }
189
190
191 /*
192  * Remove the counter from a task's (or a CPU's) list of counters.
193  *
194  * Must be called with counter->mutex and ctx->mutex held.
195  *
196  * CPU counters are removed with a smp call. For task counters we only
197  * call when the task is on a CPU.
198  */
199 static void perf_counter_remove_from_context(struct perf_counter *counter)
200 {
201         struct perf_counter_context *ctx = counter->ctx;
202         struct task_struct *task = ctx->task;
203
204         if (!task) {
205                 /*
206                  * Per cpu counters are removed via an smp call and
207                  * the removal is always sucessful.
208                  */
209                 smp_call_function_single(counter->cpu,
210                                          __perf_counter_remove_from_context,
211                                          counter, 1);
212                 return;
213         }
214
215 retry:
216         task_oncpu_function_call(task, __perf_counter_remove_from_context,
217                                  counter);
218
219         spin_lock_irq(&ctx->lock);
220         /*
221          * If the context is active we need to retry the smp call.
222          */
223         if (ctx->nr_active && !list_empty(&counter->list_entry)) {
224                 spin_unlock_irq(&ctx->lock);
225                 goto retry;
226         }
227
228         /*
229          * The lock prevents that this context is scheduled in so we
230          * can remove the counter safely, if the call above did not
231          * succeed.
232          */
233         if (!list_empty(&counter->list_entry)) {
234                 ctx->nr_counters--;
235                 list_del_counter(counter, ctx);
236                 counter->task = NULL;
237         }
238         spin_unlock_irq(&ctx->lock);
239 }
240
241 /*
242  * Cross CPU call to disable a performance counter
243  */
244 static void __perf_counter_disable(void *info)
245 {
246         struct perf_counter *counter = info;
247         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
248         struct perf_counter_context *ctx = counter->ctx;
249         unsigned long flags;
250
251         /*
252          * If this is a per-task counter, need to check whether this
253          * counter's task is the current task on this cpu.
254          */
255         if (ctx->task && cpuctx->task_ctx != ctx)
256                 return;
257
258         curr_rq_lock_irq_save(&flags);
259         spin_lock(&ctx->lock);
260
261         /*
262          * If the counter is on, turn it off.
263          * If it is in error state, leave it in error state.
264          */
265         if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
266                 if (counter == counter->group_leader)
267                         group_sched_out(counter, cpuctx, ctx);
268                 else
269                         counter_sched_out(counter, cpuctx, ctx);
270                 counter->state = PERF_COUNTER_STATE_OFF;
271         }
272
273         spin_unlock(&ctx->lock);
274         curr_rq_unlock_irq_restore(&flags);
275 }
276
277 /*
278  * Disable a counter.
279  */
280 static void perf_counter_disable(struct perf_counter *counter)
281 {
282         struct perf_counter_context *ctx = counter->ctx;
283         struct task_struct *task = ctx->task;
284
285         if (!task) {
286                 /*
287                  * Disable the counter on the cpu that it's on
288                  */
289                 smp_call_function_single(counter->cpu, __perf_counter_disable,
290                                          counter, 1);
291                 return;
292         }
293
294  retry:
295         task_oncpu_function_call(task, __perf_counter_disable, counter);
296
297         spin_lock_irq(&ctx->lock);
298         /*
299          * If the counter is still active, we need to retry the cross-call.
300          */
301         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
302                 spin_unlock_irq(&ctx->lock);
303                 goto retry;
304         }
305
306         /*
307          * Since we have the lock this context can't be scheduled
308          * in, so we can change the state safely.
309          */
310         if (counter->state == PERF_COUNTER_STATE_INACTIVE)
311                 counter->state = PERF_COUNTER_STATE_OFF;
312
313         spin_unlock_irq(&ctx->lock);
314 }
315
316 /*
317  * Disable a counter and all its children.
318  */
319 static void perf_counter_disable_family(struct perf_counter *counter)
320 {
321         struct perf_counter *child;
322
323         perf_counter_disable(counter);
324
325         /*
326          * Lock the mutex to protect the list of children
327          */
328         mutex_lock(&counter->mutex);
329         list_for_each_entry(child, &counter->child_list, child_list)
330                 perf_counter_disable(child);
331         mutex_unlock(&counter->mutex);
332 }
333
334 static int
335 counter_sched_in(struct perf_counter *counter,
336                  struct perf_cpu_context *cpuctx,
337                  struct perf_counter_context *ctx,
338                  int cpu)
339 {
340         if (counter->state <= PERF_COUNTER_STATE_OFF)
341                 return 0;
342
343         counter->state = PERF_COUNTER_STATE_ACTIVE;
344         counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
345         /*
346          * The new state must be visible before we turn it on in the hardware:
347          */
348         smp_wmb();
349
350         if (counter->hw_ops->enable(counter)) {
351                 counter->state = PERF_COUNTER_STATE_INACTIVE;
352                 counter->oncpu = -1;
353                 return -EAGAIN;
354         }
355
356         if (!is_software_counter(counter))
357                 cpuctx->active_oncpu++;
358         ctx->nr_active++;
359
360         if (counter->hw_event.exclusive)
361                 cpuctx->exclusive = 1;
362
363         return 0;
364 }
365
366 /*
367  * Return 1 for a group consisting entirely of software counters,
368  * 0 if the group contains any hardware counters.
369  */
370 static int is_software_only_group(struct perf_counter *leader)
371 {
372         struct perf_counter *counter;
373
374         if (!is_software_counter(leader))
375                 return 0;
376         list_for_each_entry(counter, &leader->sibling_list, list_entry)
377                 if (!is_software_counter(counter))
378                         return 0;
379         return 1;
380 }
381
382 /*
383  * Work out whether we can put this counter group on the CPU now.
384  */
385 static int group_can_go_on(struct perf_counter *counter,
386                            struct perf_cpu_context *cpuctx,
387                            int can_add_hw)
388 {
389         /*
390          * Groups consisting entirely of software counters can always go on.
391          */
392         if (is_software_only_group(counter))
393                 return 1;
394         /*
395          * If an exclusive group is already on, no other hardware
396          * counters can go on.
397          */
398         if (cpuctx->exclusive)
399                 return 0;
400         /*
401          * If this group is exclusive and there are already
402          * counters on the CPU, it can't go on.
403          */
404         if (counter->hw_event.exclusive && cpuctx->active_oncpu)
405                 return 0;
406         /*
407          * Otherwise, try to add it if all previous groups were able
408          * to go on.
409          */
410         return can_add_hw;
411 }
412
413 /*
414  * Cross CPU call to install and enable a performance counter
415  */
416 static void __perf_install_in_context(void *info)
417 {
418         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
419         struct perf_counter *counter = info;
420         struct perf_counter_context *ctx = counter->ctx;
421         struct perf_counter *leader = counter->group_leader;
422         int cpu = smp_processor_id();
423         unsigned long flags;
424         u64 perf_flags;
425         int err;
426
427         /*
428          * If this is a task context, we need to check whether it is
429          * the current task context of this cpu. If not it has been
430          * scheduled out before the smp call arrived.
431          */
432         if (ctx->task && cpuctx->task_ctx != ctx)
433                 return;
434
435         curr_rq_lock_irq_save(&flags);
436         spin_lock(&ctx->lock);
437
438         /*
439          * Protect the list operation against NMI by disabling the
440          * counters on a global level. NOP for non NMI based counters.
441          */
442         perf_flags = hw_perf_save_disable();
443
444         list_add_counter(counter, ctx);
445         ctx->nr_counters++;
446         counter->prev_state = PERF_COUNTER_STATE_OFF;
447
448         /*
449          * Don't put the counter on if it is disabled or if
450          * it is in a group and the group isn't on.
451          */
452         if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
453             (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
454                 goto unlock;
455
456         /*
457          * An exclusive counter can't go on if there are already active
458          * hardware counters, and no hardware counter can go on if there
459          * is already an exclusive counter on.
460          */
461         if (!group_can_go_on(counter, cpuctx, 1))
462                 err = -EEXIST;
463         else
464                 err = counter_sched_in(counter, cpuctx, ctx, cpu);
465
466         if (err) {
467                 /*
468                  * This counter couldn't go on.  If it is in a group
469                  * then we have to pull the whole group off.
470                  * If the counter group is pinned then put it in error state.
471                  */
472                 if (leader != counter)
473                         group_sched_out(leader, cpuctx, ctx);
474                 if (leader->hw_event.pinned)
475                         leader->state = PERF_COUNTER_STATE_ERROR;
476         }
477
478         if (!err && !ctx->task && cpuctx->max_pertask)
479                 cpuctx->max_pertask--;
480
481  unlock:
482         hw_perf_restore(perf_flags);
483
484         spin_unlock(&ctx->lock);
485         curr_rq_unlock_irq_restore(&flags);
486 }
487
488 /*
489  * Attach a performance counter to a context
490  *
491  * First we add the counter to the list with the hardware enable bit
492  * in counter->hw_config cleared.
493  *
494  * If the counter is attached to a task which is on a CPU we use a smp
495  * call to enable it in the task context. The task might have been
496  * scheduled away, but we check this in the smp call again.
497  *
498  * Must be called with ctx->mutex held.
499  */
500 static void
501 perf_install_in_context(struct perf_counter_context *ctx,
502                         struct perf_counter *counter,
503                         int cpu)
504 {
505         struct task_struct *task = ctx->task;
506
507         if (!task) {
508                 /*
509                  * Per cpu counters are installed via an smp call and
510                  * the install is always sucessful.
511                  */
512                 smp_call_function_single(cpu, __perf_install_in_context,
513                                          counter, 1);
514                 return;
515         }
516
517         counter->task = task;
518 retry:
519         task_oncpu_function_call(task, __perf_install_in_context,
520                                  counter);
521
522         spin_lock_irq(&ctx->lock);
523         /*
524          * we need to retry the smp call.
525          */
526         if (ctx->is_active && list_empty(&counter->list_entry)) {
527                 spin_unlock_irq(&ctx->lock);
528                 goto retry;
529         }
530
531         /*
532          * The lock prevents that this context is scheduled in so we
533          * can add the counter safely, if it the call above did not
534          * succeed.
535          */
536         if (list_empty(&counter->list_entry)) {
537                 list_add_counter(counter, ctx);
538                 ctx->nr_counters++;
539         }
540         spin_unlock_irq(&ctx->lock);
541 }
542
543 /*
544  * Cross CPU call to enable a performance counter
545  */
546 static void __perf_counter_enable(void *info)
547 {
548         struct perf_counter *counter = info;
549         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
550         struct perf_counter_context *ctx = counter->ctx;
551         struct perf_counter *leader = counter->group_leader;
552         unsigned long flags;
553         int err;
554
555         /*
556          * If this is a per-task counter, need to check whether this
557          * counter's task is the current task on this cpu.
558          */
559         if (ctx->task && cpuctx->task_ctx != ctx)
560                 return;
561
562         curr_rq_lock_irq_save(&flags);
563         spin_lock(&ctx->lock);
564
565         counter->prev_state = counter->state;
566         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
567                 goto unlock;
568         counter->state = PERF_COUNTER_STATE_INACTIVE;
569
570         /*
571          * If the counter is in a group and isn't the group leader,
572          * then don't put it on unless the group is on.
573          */
574         if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
575                 goto unlock;
576
577         if (!group_can_go_on(counter, cpuctx, 1))
578                 err = -EEXIST;
579         else
580                 err = counter_sched_in(counter, cpuctx, ctx,
581                                        smp_processor_id());
582
583         if (err) {
584                 /*
585                  * If this counter can't go on and it's part of a
586                  * group, then the whole group has to come off.
587                  */
588                 if (leader != counter)
589                         group_sched_out(leader, cpuctx, ctx);
590                 if (leader->hw_event.pinned)
591                         leader->state = PERF_COUNTER_STATE_ERROR;
592         }
593
594  unlock:
595         spin_unlock(&ctx->lock);
596         curr_rq_unlock_irq_restore(&flags);
597 }
598
599 /*
600  * Enable a counter.
601  */
602 static void perf_counter_enable(struct perf_counter *counter)
603 {
604         struct perf_counter_context *ctx = counter->ctx;
605         struct task_struct *task = ctx->task;
606
607         if (!task) {
608                 /*
609                  * Enable the counter on the cpu that it's on
610                  */
611                 smp_call_function_single(counter->cpu, __perf_counter_enable,
612                                          counter, 1);
613                 return;
614         }
615
616         spin_lock_irq(&ctx->lock);
617         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
618                 goto out;
619
620         /*
621          * If the counter is in error state, clear that first.
622          * That way, if we see the counter in error state below, we
623          * know that it has gone back into error state, as distinct
624          * from the task having been scheduled away before the
625          * cross-call arrived.
626          */
627         if (counter->state == PERF_COUNTER_STATE_ERROR)
628                 counter->state = PERF_COUNTER_STATE_OFF;
629
630  retry:
631         spin_unlock_irq(&ctx->lock);
632         task_oncpu_function_call(task, __perf_counter_enable, counter);
633
634         spin_lock_irq(&ctx->lock);
635
636         /*
637          * If the context is active and the counter is still off,
638          * we need to retry the cross-call.
639          */
640         if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
641                 goto retry;
642
643         /*
644          * Since we have the lock this context can't be scheduled
645          * in, so we can change the state safely.
646          */
647         if (counter->state == PERF_COUNTER_STATE_OFF)
648                 counter->state = PERF_COUNTER_STATE_INACTIVE;
649  out:
650         spin_unlock_irq(&ctx->lock);
651 }
652
653 /*
654  * Enable a counter and all its children.
655  */
656 static void perf_counter_enable_family(struct perf_counter *counter)
657 {
658         struct perf_counter *child;
659
660         perf_counter_enable(counter);
661
662         /*
663          * Lock the mutex to protect the list of children
664          */
665         mutex_lock(&counter->mutex);
666         list_for_each_entry(child, &counter->child_list, child_list)
667                 perf_counter_enable(child);
668         mutex_unlock(&counter->mutex);
669 }
670
671 void __perf_counter_sched_out(struct perf_counter_context *ctx,
672                               struct perf_cpu_context *cpuctx)
673 {
674         struct perf_counter *counter;
675         u64 flags;
676
677         spin_lock(&ctx->lock);
678         ctx->is_active = 0;
679         if (likely(!ctx->nr_counters))
680                 goto out;
681
682         flags = hw_perf_save_disable();
683         if (ctx->nr_active) {
684                 list_for_each_entry(counter, &ctx->counter_list, list_entry)
685                         group_sched_out(counter, cpuctx, ctx);
686         }
687         hw_perf_restore(flags);
688  out:
689         spin_unlock(&ctx->lock);
690 }
691
692 /*
693  * Called from scheduler to remove the counters of the current task,
694  * with interrupts disabled.
695  *
696  * We stop each counter and update the counter value in counter->count.
697  *
698  * This does not protect us against NMI, but disable()
699  * sets the disabled bit in the control field of counter _before_
700  * accessing the counter control register. If a NMI hits, then it will
701  * not restart the counter.
702  */
703 void perf_counter_task_sched_out(struct task_struct *task, int cpu)
704 {
705         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
706         struct perf_counter_context *ctx = &task->perf_counter_ctx;
707
708         if (likely(!cpuctx->task_ctx))
709                 return;
710
711         __perf_counter_sched_out(ctx, cpuctx);
712
713         cpuctx->task_ctx = NULL;
714 }
715
716 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
717 {
718         __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
719 }
720
721 static int
722 group_sched_in(struct perf_counter *group_counter,
723                struct perf_cpu_context *cpuctx,
724                struct perf_counter_context *ctx,
725                int cpu)
726 {
727         struct perf_counter *counter, *partial_group;
728         int ret;
729
730         if (group_counter->state == PERF_COUNTER_STATE_OFF)
731                 return 0;
732
733         ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
734         if (ret)
735                 return ret < 0 ? ret : 0;
736
737         group_counter->prev_state = group_counter->state;
738         if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
739                 return -EAGAIN;
740
741         /*
742          * Schedule in siblings as one group (if any):
743          */
744         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
745                 counter->prev_state = counter->state;
746                 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
747                         partial_group = counter;
748                         goto group_error;
749                 }
750         }
751
752         return 0;
753
754 group_error:
755         /*
756          * Groups can be scheduled in as one unit only, so undo any
757          * partial group before returning:
758          */
759         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
760                 if (counter == partial_group)
761                         break;
762                 counter_sched_out(counter, cpuctx, ctx);
763         }
764         counter_sched_out(group_counter, cpuctx, ctx);
765
766         return -EAGAIN;
767 }
768
769 static void
770 __perf_counter_sched_in(struct perf_counter_context *ctx,
771                         struct perf_cpu_context *cpuctx, int cpu)
772 {
773         struct perf_counter *counter;
774         u64 flags;
775         int can_add_hw = 1;
776
777         spin_lock(&ctx->lock);
778         ctx->is_active = 1;
779         if (likely(!ctx->nr_counters))
780                 goto out;
781
782         flags = hw_perf_save_disable();
783
784         /*
785          * First go through the list and put on any pinned groups
786          * in order to give them the best chance of going on.
787          */
788         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
789                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
790                     !counter->hw_event.pinned)
791                         continue;
792                 if (counter->cpu != -1 && counter->cpu != cpu)
793                         continue;
794
795                 if (group_can_go_on(counter, cpuctx, 1))
796                         group_sched_in(counter, cpuctx, ctx, cpu);
797
798                 /*
799                  * If this pinned group hasn't been scheduled,
800                  * put it in error state.
801                  */
802                 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
803                         counter->state = PERF_COUNTER_STATE_ERROR;
804         }
805
806         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
807                 /*
808                  * Ignore counters in OFF or ERROR state, and
809                  * ignore pinned counters since we did them already.
810                  */
811                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
812                     counter->hw_event.pinned)
813                         continue;
814
815                 /*
816                  * Listen to the 'cpu' scheduling filter constraint
817                  * of counters:
818                  */
819                 if (counter->cpu != -1 && counter->cpu != cpu)
820                         continue;
821
822                 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
823                         if (group_sched_in(counter, cpuctx, ctx, cpu))
824                                 can_add_hw = 0;
825                 }
826         }
827         hw_perf_restore(flags);
828  out:
829         spin_unlock(&ctx->lock);
830 }
831
832 /*
833  * Called from scheduler to add the counters of the current task
834  * with interrupts disabled.
835  *
836  * We restore the counter value and then enable it.
837  *
838  * This does not protect us against NMI, but enable()
839  * sets the enabled bit in the control field of counter _before_
840  * accessing the counter control register. If a NMI hits, then it will
841  * keep the counter running.
842  */
843 void perf_counter_task_sched_in(struct task_struct *task, int cpu)
844 {
845         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
846         struct perf_counter_context *ctx = &task->perf_counter_ctx;
847
848         __perf_counter_sched_in(ctx, cpuctx, cpu);
849         cpuctx->task_ctx = ctx;
850 }
851
852 static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
853 {
854         struct perf_counter_context *ctx = &cpuctx->ctx;
855
856         __perf_counter_sched_in(ctx, cpuctx, cpu);
857 }
858
859 int perf_counter_task_disable(void)
860 {
861         struct task_struct *curr = current;
862         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
863         struct perf_counter *counter;
864         unsigned long flags;
865         u64 perf_flags;
866         int cpu;
867
868         if (likely(!ctx->nr_counters))
869                 return 0;
870
871         curr_rq_lock_irq_save(&flags);
872         cpu = smp_processor_id();
873
874         /* force the update of the task clock: */
875         __task_delta_exec(curr, 1);
876
877         perf_counter_task_sched_out(curr, cpu);
878
879         spin_lock(&ctx->lock);
880
881         /*
882          * Disable all the counters:
883          */
884         perf_flags = hw_perf_save_disable();
885
886         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
887                 if (counter->state != PERF_COUNTER_STATE_ERROR)
888                         counter->state = PERF_COUNTER_STATE_OFF;
889         }
890
891         hw_perf_restore(perf_flags);
892
893         spin_unlock(&ctx->lock);
894
895         curr_rq_unlock_irq_restore(&flags);
896
897         return 0;
898 }
899
900 int perf_counter_task_enable(void)
901 {
902         struct task_struct *curr = current;
903         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
904         struct perf_counter *counter;
905         unsigned long flags;
906         u64 perf_flags;
907         int cpu;
908
909         if (likely(!ctx->nr_counters))
910                 return 0;
911
912         curr_rq_lock_irq_save(&flags);
913         cpu = smp_processor_id();
914
915         /* force the update of the task clock: */
916         __task_delta_exec(curr, 1);
917
918         perf_counter_task_sched_out(curr, cpu);
919
920         spin_lock(&ctx->lock);
921
922         /*
923          * Disable all the counters:
924          */
925         perf_flags = hw_perf_save_disable();
926
927         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
928                 if (counter->state > PERF_COUNTER_STATE_OFF)
929                         continue;
930                 counter->state = PERF_COUNTER_STATE_INACTIVE;
931                 counter->hw_event.disabled = 0;
932         }
933         hw_perf_restore(perf_flags);
934
935         spin_unlock(&ctx->lock);
936
937         perf_counter_task_sched_in(curr, cpu);
938
939         curr_rq_unlock_irq_restore(&flags);
940
941         return 0;
942 }
943
944 /*
945  * Round-robin a context's counters:
946  */
947 static void rotate_ctx(struct perf_counter_context *ctx)
948 {
949         struct perf_counter *counter;
950         u64 perf_flags;
951
952         if (!ctx->nr_counters)
953                 return;
954
955         spin_lock(&ctx->lock);
956         /*
957          * Rotate the first entry last (works just fine for group counters too):
958          */
959         perf_flags = hw_perf_save_disable();
960         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
961                 list_move_tail(&counter->list_entry, &ctx->counter_list);
962                 break;
963         }
964         hw_perf_restore(perf_flags);
965
966         spin_unlock(&ctx->lock);
967 }
968
969 void perf_counter_task_tick(struct task_struct *curr, int cpu)
970 {
971         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
972         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
973         const int rotate_percpu = 0;
974
975         if (rotate_percpu)
976                 perf_counter_cpu_sched_out(cpuctx);
977         perf_counter_task_sched_out(curr, cpu);
978
979         if (rotate_percpu)
980                 rotate_ctx(&cpuctx->ctx);
981         rotate_ctx(ctx);
982
983         if (rotate_percpu)
984                 perf_counter_cpu_sched_in(cpuctx, cpu);
985         perf_counter_task_sched_in(curr, cpu);
986 }
987
988 /*
989  * Cross CPU call to read the hardware counter
990  */
991 static void __read(void *info)
992 {
993         struct perf_counter *counter = info;
994         unsigned long flags;
995
996         curr_rq_lock_irq_save(&flags);
997         counter->hw_ops->read(counter);
998         curr_rq_unlock_irq_restore(&flags);
999 }
1000
1001 static u64 perf_counter_read(struct perf_counter *counter)
1002 {
1003         /*
1004          * If counter is enabled and currently active on a CPU, update the
1005          * value in the counter structure:
1006          */
1007         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1008                 smp_call_function_single(counter->oncpu,
1009                                          __read, counter, 1);
1010         }
1011
1012         return atomic64_read(&counter->count);
1013 }
1014
1015 /*
1016  * Cross CPU call to switch performance data pointers
1017  */
1018 static void __perf_switch_irq_data(void *info)
1019 {
1020         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1021         struct perf_counter *counter = info;
1022         struct perf_counter_context *ctx = counter->ctx;
1023         struct perf_data *oldirqdata = counter->irqdata;
1024
1025         /*
1026          * If this is a task context, we need to check whether it is
1027          * the current task context of this cpu. If not it has been
1028          * scheduled out before the smp call arrived.
1029          */
1030         if (ctx->task) {
1031                 if (cpuctx->task_ctx != ctx)
1032                         return;
1033                 spin_lock(&ctx->lock);
1034         }
1035
1036         /* Change the pointer NMI safe */
1037         atomic_long_set((atomic_long_t *)&counter->irqdata,
1038                         (unsigned long) counter->usrdata);
1039         counter->usrdata = oldirqdata;
1040
1041         if (ctx->task)
1042                 spin_unlock(&ctx->lock);
1043 }
1044
1045 static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
1046 {
1047         struct perf_counter_context *ctx = counter->ctx;
1048         struct perf_data *oldirqdata = counter->irqdata;
1049         struct task_struct *task = ctx->task;
1050
1051         if (!task) {
1052                 smp_call_function_single(counter->cpu,
1053                                          __perf_switch_irq_data,
1054                                          counter, 1);
1055                 return counter->usrdata;
1056         }
1057
1058 retry:
1059         spin_lock_irq(&ctx->lock);
1060         if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
1061                 counter->irqdata = counter->usrdata;
1062                 counter->usrdata = oldirqdata;
1063                 spin_unlock_irq(&ctx->lock);
1064                 return oldirqdata;
1065         }
1066         spin_unlock_irq(&ctx->lock);
1067         task_oncpu_function_call(task, __perf_switch_irq_data, counter);
1068         /* Might have failed, because task was scheduled out */
1069         if (counter->irqdata == oldirqdata)
1070                 goto retry;
1071
1072         return counter->usrdata;
1073 }
1074
1075 static void put_context(struct perf_counter_context *ctx)
1076 {
1077         if (ctx->task)
1078                 put_task_struct(ctx->task);
1079 }
1080
1081 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1082 {
1083         struct perf_cpu_context *cpuctx;
1084         struct perf_counter_context *ctx;
1085         struct task_struct *task;
1086
1087         /*
1088          * If cpu is not a wildcard then this is a percpu counter:
1089          */
1090         if (cpu != -1) {
1091                 /* Must be root to operate on a CPU counter: */
1092                 if (!capable(CAP_SYS_ADMIN))
1093                         return ERR_PTR(-EACCES);
1094
1095                 if (cpu < 0 || cpu > num_possible_cpus())
1096                         return ERR_PTR(-EINVAL);
1097
1098                 /*
1099                  * We could be clever and allow to attach a counter to an
1100                  * offline CPU and activate it when the CPU comes up, but
1101                  * that's for later.
1102                  */
1103                 if (!cpu_isset(cpu, cpu_online_map))
1104                         return ERR_PTR(-ENODEV);
1105
1106                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1107                 ctx = &cpuctx->ctx;
1108
1109                 return ctx;
1110         }
1111
1112         rcu_read_lock();
1113         if (!pid)
1114                 task = current;
1115         else
1116                 task = find_task_by_vpid(pid);
1117         if (task)
1118                 get_task_struct(task);
1119         rcu_read_unlock();
1120
1121         if (!task)
1122                 return ERR_PTR(-ESRCH);
1123
1124         ctx = &task->perf_counter_ctx;
1125         ctx->task = task;
1126
1127         /* Reuse ptrace permission checks for now. */
1128         if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1129                 put_context(ctx);
1130                 return ERR_PTR(-EACCES);
1131         }
1132
1133         return ctx;
1134 }
1135
1136 /*
1137  * Called when the last reference to the file is gone.
1138  */
1139 static int perf_release(struct inode *inode, struct file *file)
1140 {
1141         struct perf_counter *counter = file->private_data;
1142         struct perf_counter_context *ctx = counter->ctx;
1143
1144         file->private_data = NULL;
1145
1146         mutex_lock(&ctx->mutex);
1147         mutex_lock(&counter->mutex);
1148
1149         perf_counter_remove_from_context(counter);
1150
1151         mutex_unlock(&counter->mutex);
1152         mutex_unlock(&ctx->mutex);
1153
1154         kfree(counter);
1155         put_context(ctx);
1156
1157         return 0;
1158 }
1159
1160 /*
1161  * Read the performance counter - simple non blocking version for now
1162  */
1163 static ssize_t
1164 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1165 {
1166         u64 cntval;
1167
1168         if (count != sizeof(cntval))
1169                 return -EINVAL;
1170
1171         /*
1172          * Return end-of-file for a read on a counter that is in
1173          * error state (i.e. because it was pinned but it couldn't be
1174          * scheduled on to the CPU at some point).
1175          */
1176         if (counter->state == PERF_COUNTER_STATE_ERROR)
1177                 return 0;
1178
1179         mutex_lock(&counter->mutex);
1180         cntval = perf_counter_read(counter);
1181         mutex_unlock(&counter->mutex);
1182
1183         return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
1184 }
1185
1186 static ssize_t
1187 perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
1188 {
1189         if (!usrdata->len)
1190                 return 0;
1191
1192         count = min(count, (size_t)usrdata->len);
1193         if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
1194                 return -EFAULT;
1195
1196         /* Adjust the counters */
1197         usrdata->len -= count;
1198         if (!usrdata->len)
1199                 usrdata->rd_idx = 0;
1200         else
1201                 usrdata->rd_idx += count;
1202
1203         return count;
1204 }
1205
1206 static ssize_t
1207 perf_read_irq_data(struct perf_counter  *counter,
1208                    char __user          *buf,
1209                    size_t               count,
1210                    int                  nonblocking)
1211 {
1212         struct perf_data *irqdata, *usrdata;
1213         DECLARE_WAITQUEUE(wait, current);
1214         ssize_t res, res2;
1215
1216         irqdata = counter->irqdata;
1217         usrdata = counter->usrdata;
1218
1219         if (usrdata->len + irqdata->len >= count)
1220                 goto read_pending;
1221
1222         if (nonblocking)
1223                 return -EAGAIN;
1224
1225         spin_lock_irq(&counter->waitq.lock);
1226         __add_wait_queue(&counter->waitq, &wait);
1227         for (;;) {
1228                 set_current_state(TASK_INTERRUPTIBLE);
1229                 if (usrdata->len + irqdata->len >= count)
1230                         break;
1231
1232                 if (signal_pending(current))
1233                         break;
1234
1235                 if (counter->state == PERF_COUNTER_STATE_ERROR)
1236                         break;
1237
1238                 spin_unlock_irq(&counter->waitq.lock);
1239                 schedule();
1240                 spin_lock_irq(&counter->waitq.lock);
1241         }
1242         __remove_wait_queue(&counter->waitq, &wait);
1243         __set_current_state(TASK_RUNNING);
1244         spin_unlock_irq(&counter->waitq.lock);
1245
1246         if (usrdata->len + irqdata->len < count &&
1247             counter->state != PERF_COUNTER_STATE_ERROR)
1248                 return -ERESTARTSYS;
1249 read_pending:
1250         mutex_lock(&counter->mutex);
1251
1252         /* Drain pending data first: */
1253         res = perf_copy_usrdata(usrdata, buf, count);
1254         if (res < 0 || res == count)
1255                 goto out;
1256
1257         /* Switch irq buffer: */
1258         usrdata = perf_switch_irq_data(counter);
1259         res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
1260         if (res2 < 0) {
1261                 if (!res)
1262                         res = -EFAULT;
1263         } else {
1264                 res += res2;
1265         }
1266 out:
1267         mutex_unlock(&counter->mutex);
1268
1269         return res;
1270 }
1271
1272 static ssize_t
1273 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1274 {
1275         struct perf_counter *counter = file->private_data;
1276
1277         switch (counter->hw_event.record_type) {
1278         case PERF_RECORD_SIMPLE:
1279                 return perf_read_hw(counter, buf, count);
1280
1281         case PERF_RECORD_IRQ:
1282         case PERF_RECORD_GROUP:
1283                 return perf_read_irq_data(counter, buf, count,
1284                                           file->f_flags & O_NONBLOCK);
1285         }
1286         return -EINVAL;
1287 }
1288
1289 static unsigned int perf_poll(struct file *file, poll_table *wait)
1290 {
1291         struct perf_counter *counter = file->private_data;
1292         unsigned int events = 0;
1293         unsigned long flags;
1294
1295         poll_wait(file, &counter->waitq, wait);
1296
1297         spin_lock_irqsave(&counter->waitq.lock, flags);
1298         if (counter->usrdata->len || counter->irqdata->len)
1299                 events |= POLLIN;
1300         spin_unlock_irqrestore(&counter->waitq.lock, flags);
1301
1302         return events;
1303 }
1304
1305 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1306 {
1307         struct perf_counter *counter = file->private_data;
1308         int err = 0;
1309
1310         switch (cmd) {
1311         case PERF_COUNTER_IOC_ENABLE:
1312                 perf_counter_enable_family(counter);
1313                 break;
1314         case PERF_COUNTER_IOC_DISABLE:
1315                 perf_counter_disable_family(counter);
1316                 break;
1317         default:
1318                 err = -ENOTTY;
1319         }
1320         return err;
1321 }
1322
1323 static const struct file_operations perf_fops = {
1324         .release                = perf_release,
1325         .read                   = perf_read,
1326         .poll                   = perf_poll,
1327         .unlocked_ioctl         = perf_ioctl,
1328         .compat_ioctl           = perf_ioctl,
1329 };
1330
1331 /*
1332  * Generic software counter infrastructure
1333  */
1334
1335 static void perf_swcounter_update(struct perf_counter *counter)
1336 {
1337         struct hw_perf_counter *hwc = &counter->hw;
1338         u64 prev, now;
1339         s64 delta;
1340
1341 again:
1342         prev = atomic64_read(&hwc->prev_count);
1343         now = atomic64_read(&hwc->count);
1344         if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
1345                 goto again;
1346
1347         delta = now - prev;
1348
1349         atomic64_add(delta, &counter->count);
1350         atomic64_sub(delta, &hwc->period_left);
1351 }
1352
1353 static void perf_swcounter_set_period(struct perf_counter *counter)
1354 {
1355         struct hw_perf_counter *hwc = &counter->hw;
1356         s64 left = atomic64_read(&hwc->period_left);
1357         s64 period = hwc->irq_period;
1358
1359         if (unlikely(left <= -period)) {
1360                 left = period;
1361                 atomic64_set(&hwc->period_left, left);
1362         }
1363
1364         if (unlikely(left <= 0)) {
1365                 left += period;
1366                 atomic64_add(period, &hwc->period_left);
1367         }
1368
1369         atomic64_set(&hwc->prev_count, -left);
1370         atomic64_set(&hwc->count, -left);
1371 }
1372
1373 static void perf_swcounter_save_and_restart(struct perf_counter *counter)
1374 {
1375         perf_swcounter_update(counter);
1376         perf_swcounter_set_period(counter);
1377 }
1378
1379 static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data)
1380 {
1381         struct perf_data *irqdata = counter->irqdata;
1382
1383         if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
1384                 irqdata->overrun++;
1385         } else {
1386                 u64 *p = (u64 *) &irqdata->data[irqdata->len];
1387
1388                 *p = data;
1389                 irqdata->len += sizeof(u64);
1390         }
1391 }
1392
1393 static void perf_swcounter_handle_group(struct perf_counter *sibling)
1394 {
1395         struct perf_counter *counter, *group_leader = sibling->group_leader;
1396
1397         list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
1398                 perf_swcounter_update(counter);
1399                 perf_swcounter_store_irq(sibling, counter->hw_event.type);
1400                 perf_swcounter_store_irq(sibling, atomic64_read(&counter->count));
1401         }
1402 }
1403
1404 static void perf_swcounter_interrupt(struct perf_counter *counter,
1405                                      int nmi, struct pt_regs *regs)
1406 {
1407         perf_swcounter_save_and_restart(counter);
1408
1409         switch (counter->hw_event.record_type) {
1410         case PERF_RECORD_SIMPLE:
1411                 break;
1412
1413         case PERF_RECORD_IRQ:
1414                 perf_swcounter_store_irq(counter, instruction_pointer(regs));
1415                 break;
1416
1417         case PERF_RECORD_GROUP:
1418                 perf_swcounter_handle_group(counter);
1419                 break;
1420         }
1421
1422         if (nmi) {
1423                 counter->wakeup_pending = 1;
1424                 set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
1425         } else
1426                 wake_up(&counter->waitq);
1427 }
1428
1429 static int perf_swcounter_match(struct perf_counter *counter,
1430                                 enum hw_event_types event,
1431                                 struct pt_regs *regs)
1432 {
1433         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1434                 return 0;
1435
1436         if (counter->hw_event.raw)
1437                 return 0;
1438
1439         if (counter->hw_event.type != event)
1440                 return 0;
1441
1442         if (counter->hw_event.exclude_user && user_mode(regs))
1443                 return 0;
1444
1445         if (counter->hw_event.exclude_kernel && !user_mode(regs))
1446                 return 0;
1447
1448         return 1;
1449 }
1450
1451 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
1452                                      enum hw_event_types event, u64 nr,
1453                                      int nmi, struct pt_regs *regs)
1454 {
1455         struct perf_counter *counter;
1456         unsigned long flags;
1457         int neg;
1458
1459         if (list_empty(&ctx->counter_list))
1460                 return;
1461
1462         spin_lock_irqsave(&ctx->lock, flags);
1463
1464         /*
1465          * XXX: make counter_list RCU safe
1466          */
1467         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1468                 if (perf_swcounter_match(counter, event, regs)) {
1469                         neg = atomic64_add_negative(nr, &counter->hw.count);
1470                         if (counter->hw.irq_period && !neg)
1471                                 perf_swcounter_interrupt(counter, nmi, regs);
1472                 }
1473         }
1474
1475         spin_unlock_irqrestore(&ctx->lock, flags);
1476 }
1477
1478 void perf_swcounter_event(enum hw_event_types event, u64 nr,
1479                           int nmi, struct pt_regs *regs)
1480 {
1481         struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
1482
1483         perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs);
1484         if (cpuctx->task_ctx)
1485                 perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs);
1486
1487         put_cpu_var(perf_cpu_context);
1488 }
1489
1490 static void perf_swcounter_read(struct perf_counter *counter)
1491 {
1492         perf_swcounter_update(counter);
1493 }
1494
1495 static int perf_swcounter_enable(struct perf_counter *counter)
1496 {
1497         perf_swcounter_set_period(counter);
1498         return 0;
1499 }
1500
1501 static void perf_swcounter_disable(struct perf_counter *counter)
1502 {
1503         perf_swcounter_update(counter);
1504 }
1505
1506 /*
1507  * Software counter: cpu wall time clock
1508  */
1509
1510 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
1511 {
1512         int cpu = raw_smp_processor_id();
1513
1514         atomic64_set(&counter->hw.prev_count, cpu_clock(cpu));
1515         return 0;
1516 }
1517
1518 static void cpu_clock_perf_counter_update(struct perf_counter *counter)
1519 {
1520         int cpu = raw_smp_processor_id();
1521         s64 prev;
1522         u64 now;
1523
1524         now = cpu_clock(cpu);
1525         prev = atomic64_read(&counter->hw.prev_count);
1526         atomic64_set(&counter->hw.prev_count, now);
1527         atomic64_add(now - prev, &counter->count);
1528 }
1529
1530 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
1531 {
1532         cpu_clock_perf_counter_update(counter);
1533 }
1534
1535 static void cpu_clock_perf_counter_read(struct perf_counter *counter)
1536 {
1537         cpu_clock_perf_counter_update(counter);
1538 }
1539
1540 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
1541         .enable         = cpu_clock_perf_counter_enable,
1542         .disable        = cpu_clock_perf_counter_disable,
1543         .read           = cpu_clock_perf_counter_read,
1544 };
1545
1546 /*
1547  * Software counter: task time clock
1548  */
1549
1550 /*
1551  * Called from within the scheduler:
1552  */
1553 static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
1554 {
1555         struct task_struct *curr = counter->task;
1556         u64 delta;
1557
1558         delta = __task_delta_exec(curr, update);
1559
1560         return curr->se.sum_exec_runtime + delta;
1561 }
1562
1563 static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
1564 {
1565         u64 prev;
1566         s64 delta;
1567
1568         prev = atomic64_read(&counter->hw.prev_count);
1569
1570         atomic64_set(&counter->hw.prev_count, now);
1571
1572         delta = now - prev;
1573
1574         atomic64_add(delta, &counter->count);
1575 }
1576
1577 static void task_clock_perf_counter_read(struct perf_counter *counter)
1578 {
1579         u64 now = task_clock_perf_counter_val(counter, 1);
1580
1581         task_clock_perf_counter_update(counter, now);
1582 }
1583
1584 static int task_clock_perf_counter_enable(struct perf_counter *counter)
1585 {
1586         if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
1587                 atomic64_set(&counter->hw.prev_count,
1588                              task_clock_perf_counter_val(counter, 0));
1589
1590         return 0;
1591 }
1592
1593 static void task_clock_perf_counter_disable(struct perf_counter *counter)
1594 {
1595         u64 now = task_clock_perf_counter_val(counter, 0);
1596
1597         task_clock_perf_counter_update(counter, now);
1598 }
1599
1600 static const struct hw_perf_counter_ops perf_ops_task_clock = {
1601         .enable         = task_clock_perf_counter_enable,
1602         .disable        = task_clock_perf_counter_disable,
1603         .read           = task_clock_perf_counter_read,
1604 };
1605
1606 /*
1607  * Software counter: page faults
1608  */
1609
1610 #ifdef CONFIG_VM_EVENT_COUNTERS
1611 #define cpu_page_faults()       __get_cpu_var(vm_event_states).event[PGFAULT]
1612 #else
1613 #define cpu_page_faults()       0
1614 #endif
1615
1616 static u64 get_page_faults(struct perf_counter *counter)
1617 {
1618         struct task_struct *curr = counter->ctx->task;
1619
1620         if (curr)
1621                 return curr->maj_flt + curr->min_flt;
1622         return cpu_page_faults();
1623 }
1624
1625 static void page_faults_perf_counter_update(struct perf_counter *counter)
1626 {
1627         u64 prev, now;
1628         s64 delta;
1629
1630         prev = atomic64_read(&counter->hw.prev_count);
1631         now = get_page_faults(counter);
1632
1633         atomic64_set(&counter->hw.prev_count, now);
1634
1635         delta = now - prev;
1636
1637         atomic64_add(delta, &counter->count);
1638 }
1639
1640 static void page_faults_perf_counter_read(struct perf_counter *counter)
1641 {
1642         page_faults_perf_counter_update(counter);
1643 }
1644
1645 static int page_faults_perf_counter_enable(struct perf_counter *counter)
1646 {
1647         if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
1648                 atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
1649         return 0;
1650 }
1651
1652 static void page_faults_perf_counter_disable(struct perf_counter *counter)
1653 {
1654         page_faults_perf_counter_update(counter);
1655 }
1656
1657 static const struct hw_perf_counter_ops perf_ops_page_faults = {
1658         .enable         = page_faults_perf_counter_enable,
1659         .disable        = page_faults_perf_counter_disable,
1660         .read           = page_faults_perf_counter_read,
1661 };
1662
1663 /*
1664  * Software counter: context switches
1665  */
1666
1667 static u64 get_context_switches(struct perf_counter *counter)
1668 {
1669         struct task_struct *curr = counter->ctx->task;
1670
1671         if (curr)
1672                 return curr->nvcsw + curr->nivcsw;
1673         return cpu_nr_switches(smp_processor_id());
1674 }
1675
1676 static void context_switches_perf_counter_update(struct perf_counter *counter)
1677 {
1678         u64 prev, now;
1679         s64 delta;
1680
1681         prev = atomic64_read(&counter->hw.prev_count);
1682         now = get_context_switches(counter);
1683
1684         atomic64_set(&counter->hw.prev_count, now);
1685
1686         delta = now - prev;
1687
1688         atomic64_add(delta, &counter->count);
1689 }
1690
1691 static void context_switches_perf_counter_read(struct perf_counter *counter)
1692 {
1693         context_switches_perf_counter_update(counter);
1694 }
1695
1696 static int context_switches_perf_counter_enable(struct perf_counter *counter)
1697 {
1698         if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
1699                 atomic64_set(&counter->hw.prev_count,
1700                              get_context_switches(counter));
1701         return 0;
1702 }
1703
1704 static void context_switches_perf_counter_disable(struct perf_counter *counter)
1705 {
1706         context_switches_perf_counter_update(counter);
1707 }
1708
1709 static const struct hw_perf_counter_ops perf_ops_context_switches = {
1710         .enable         = context_switches_perf_counter_enable,
1711         .disable        = context_switches_perf_counter_disable,
1712         .read           = context_switches_perf_counter_read,
1713 };
1714
1715 /*
1716  * Software counter: cpu migrations
1717  */
1718
1719 static inline u64 get_cpu_migrations(struct perf_counter *counter)
1720 {
1721         struct task_struct *curr = counter->ctx->task;
1722
1723         if (curr)
1724                 return curr->se.nr_migrations;
1725         return cpu_nr_migrations(smp_processor_id());
1726 }
1727
1728 static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
1729 {
1730         u64 prev, now;
1731         s64 delta;
1732
1733         prev = atomic64_read(&counter->hw.prev_count);
1734         now = get_cpu_migrations(counter);
1735
1736         atomic64_set(&counter->hw.prev_count, now);
1737
1738         delta = now - prev;
1739
1740         atomic64_add(delta, &counter->count);
1741 }
1742
1743 static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
1744 {
1745         cpu_migrations_perf_counter_update(counter);
1746 }
1747
1748 static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
1749 {
1750         if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
1751                 atomic64_set(&counter->hw.prev_count,
1752                              get_cpu_migrations(counter));
1753         return 0;
1754 }
1755
1756 static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
1757 {
1758         cpu_migrations_perf_counter_update(counter);
1759 }
1760
1761 static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
1762         .enable         = cpu_migrations_perf_counter_enable,
1763         .disable        = cpu_migrations_perf_counter_disable,
1764         .read           = cpu_migrations_perf_counter_read,
1765 };
1766
1767 static const struct hw_perf_counter_ops *
1768 sw_perf_counter_init(struct perf_counter *counter)
1769 {
1770         struct perf_counter_hw_event *hw_event = &counter->hw_event;
1771         const struct hw_perf_counter_ops *hw_ops = NULL;
1772         struct hw_perf_counter *hwc = &counter->hw;
1773
1774         /*
1775          * Software counters (currently) can't in general distinguish
1776          * between user, kernel and hypervisor events.
1777          * However, context switches and cpu migrations are considered
1778          * to be kernel events, and page faults are never hypervisor
1779          * events.
1780          */
1781         switch (counter->hw_event.type) {
1782         case PERF_COUNT_CPU_CLOCK:
1783                 if (!(counter->hw_event.exclude_user ||
1784                       counter->hw_event.exclude_kernel ||
1785                       counter->hw_event.exclude_hv))
1786                         hw_ops = &perf_ops_cpu_clock;
1787                 break;
1788         case PERF_COUNT_TASK_CLOCK:
1789                 if (counter->hw_event.exclude_user ||
1790                     counter->hw_event.exclude_kernel ||
1791                     counter->hw_event.exclude_hv)
1792                         break;
1793                 /*
1794                  * If the user instantiates this as a per-cpu counter,
1795                  * use the cpu_clock counter instead.
1796                  */
1797                 if (counter->ctx->task)
1798                         hw_ops = &perf_ops_task_clock;
1799                 else
1800                         hw_ops = &perf_ops_cpu_clock;
1801                 break;
1802         case PERF_COUNT_PAGE_FAULTS:
1803                 if (!(counter->hw_event.exclude_user ||
1804                       counter->hw_event.exclude_kernel))
1805                         hw_ops = &perf_ops_page_faults;
1806                 break;
1807         case PERF_COUNT_CONTEXT_SWITCHES:
1808                 if (!counter->hw_event.exclude_kernel)
1809                         hw_ops = &perf_ops_context_switches;
1810                 break;
1811         case PERF_COUNT_CPU_MIGRATIONS:
1812                 if (!counter->hw_event.exclude_kernel)
1813                         hw_ops = &perf_ops_cpu_migrations;
1814                 break;
1815         default:
1816                 break;
1817         }
1818
1819         if (hw_ops)
1820                 hwc->irq_period = hw_event->irq_period;
1821
1822         return hw_ops;
1823 }
1824
1825 /*
1826  * Allocate and initialize a counter structure
1827  */
1828 static struct perf_counter *
1829 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
1830                    int cpu,
1831                    struct perf_counter_context *ctx,
1832                    struct perf_counter *group_leader,
1833                    gfp_t gfpflags)
1834 {
1835         const struct hw_perf_counter_ops *hw_ops;
1836         struct perf_counter *counter;
1837
1838         counter = kzalloc(sizeof(*counter), gfpflags);
1839         if (!counter)
1840                 return NULL;
1841
1842         /*
1843          * Single counters are their own group leaders, with an
1844          * empty sibling list:
1845          */
1846         if (!group_leader)
1847                 group_leader = counter;
1848
1849         mutex_init(&counter->mutex);
1850         INIT_LIST_HEAD(&counter->list_entry);
1851         INIT_LIST_HEAD(&counter->sibling_list);
1852         init_waitqueue_head(&counter->waitq);
1853
1854         INIT_LIST_HEAD(&counter->child_list);
1855
1856         counter->irqdata                = &counter->data[0];
1857         counter->usrdata                = &counter->data[1];
1858         counter->cpu                    = cpu;
1859         counter->hw_event               = *hw_event;
1860         counter->wakeup_pending         = 0;
1861         counter->group_leader           = group_leader;
1862         counter->hw_ops                 = NULL;
1863         counter->ctx                    = ctx;
1864
1865         counter->state = PERF_COUNTER_STATE_INACTIVE;
1866         if (hw_event->disabled)
1867                 counter->state = PERF_COUNTER_STATE_OFF;
1868
1869         hw_ops = NULL;
1870         if (!hw_event->raw && hw_event->type < 0)
1871                 hw_ops = sw_perf_counter_init(counter);
1872         else
1873                 hw_ops = hw_perf_counter_init(counter);
1874
1875         if (!hw_ops) {
1876                 kfree(counter);
1877                 return NULL;
1878         }
1879         counter->hw_ops = hw_ops;
1880
1881         return counter;
1882 }
1883
1884 /**
1885  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
1886  *
1887  * @hw_event_uptr:      event type attributes for monitoring/sampling
1888  * @pid:                target pid
1889  * @cpu:                target cpu
1890  * @group_fd:           group leader counter fd
1891  */
1892 SYSCALL_DEFINE5(perf_counter_open,
1893                 const struct perf_counter_hw_event __user *, hw_event_uptr,
1894                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
1895 {
1896         struct perf_counter *counter, *group_leader;
1897         struct perf_counter_hw_event hw_event;
1898         struct perf_counter_context *ctx;
1899         struct file *counter_file = NULL;
1900         struct file *group_file = NULL;
1901         int fput_needed = 0;
1902         int fput_needed2 = 0;
1903         int ret;
1904
1905         /* for future expandability... */
1906         if (flags)
1907                 return -EINVAL;
1908
1909         if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
1910                 return -EFAULT;
1911
1912         /*
1913          * Get the target context (task or percpu):
1914          */
1915         ctx = find_get_context(pid, cpu);
1916         if (IS_ERR(ctx))
1917                 return PTR_ERR(ctx);
1918
1919         /*
1920          * Look up the group leader (we will attach this counter to it):
1921          */
1922         group_leader = NULL;
1923         if (group_fd != -1) {
1924                 ret = -EINVAL;
1925                 group_file = fget_light(group_fd, &fput_needed);
1926                 if (!group_file)
1927                         goto err_put_context;
1928                 if (group_file->f_op != &perf_fops)
1929                         goto err_put_context;
1930
1931                 group_leader = group_file->private_data;
1932                 /*
1933                  * Do not allow a recursive hierarchy (this new sibling
1934                  * becoming part of another group-sibling):
1935                  */
1936                 if (group_leader->group_leader != group_leader)
1937                         goto err_put_context;
1938                 /*
1939                  * Do not allow to attach to a group in a different
1940                  * task or CPU context:
1941                  */
1942                 if (group_leader->ctx != ctx)
1943                         goto err_put_context;
1944                 /*
1945                  * Only a group leader can be exclusive or pinned
1946                  */
1947                 if (hw_event.exclusive || hw_event.pinned)
1948                         goto err_put_context;
1949         }
1950
1951         ret = -EINVAL;
1952         counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
1953                                      GFP_KERNEL);
1954         if (!counter)
1955                 goto err_put_context;
1956
1957         ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
1958         if (ret < 0)
1959                 goto err_free_put_context;
1960
1961         counter_file = fget_light(ret, &fput_needed2);
1962         if (!counter_file)
1963                 goto err_free_put_context;
1964
1965         counter->filp = counter_file;
1966         mutex_lock(&ctx->mutex);
1967         perf_install_in_context(ctx, counter, cpu);
1968         mutex_unlock(&ctx->mutex);
1969
1970         fput_light(counter_file, fput_needed2);
1971
1972 out_fput:
1973         fput_light(group_file, fput_needed);
1974
1975         return ret;
1976
1977 err_free_put_context:
1978         kfree(counter);
1979
1980 err_put_context:
1981         put_context(ctx);
1982
1983         goto out_fput;
1984 }
1985
1986 /*
1987  * Initialize the perf_counter context in a task_struct:
1988  */
1989 static void
1990 __perf_counter_init_context(struct perf_counter_context *ctx,
1991                             struct task_struct *task)
1992 {
1993         memset(ctx, 0, sizeof(*ctx));
1994         spin_lock_init(&ctx->lock);
1995         mutex_init(&ctx->mutex);
1996         INIT_LIST_HEAD(&ctx->counter_list);
1997         ctx->task = task;
1998 }
1999
2000 /*
2001  * inherit a counter from parent task to child task:
2002  */
2003 static struct perf_counter *
2004 inherit_counter(struct perf_counter *parent_counter,
2005               struct task_struct *parent,
2006               struct perf_counter_context *parent_ctx,
2007               struct task_struct *child,
2008               struct perf_counter *group_leader,
2009               struct perf_counter_context *child_ctx)
2010 {
2011         struct perf_counter *child_counter;
2012
2013         /*
2014          * Instead of creating recursive hierarchies of counters,
2015          * we link inherited counters back to the original parent,
2016          * which has a filp for sure, which we use as the reference
2017          * count:
2018          */
2019         if (parent_counter->parent)
2020                 parent_counter = parent_counter->parent;
2021
2022         child_counter = perf_counter_alloc(&parent_counter->hw_event,
2023                                            parent_counter->cpu, child_ctx,
2024                                            group_leader, GFP_KERNEL);
2025         if (!child_counter)
2026                 return NULL;
2027
2028         /*
2029          * Link it up in the child's context:
2030          */
2031         child_counter->task = child;
2032         list_add_counter(child_counter, child_ctx);
2033         child_ctx->nr_counters++;
2034
2035         child_counter->parent = parent_counter;
2036         /*
2037          * inherit into child's child as well:
2038          */
2039         child_counter->hw_event.inherit = 1;
2040
2041         /*
2042          * Get a reference to the parent filp - we will fput it
2043          * when the child counter exits. This is safe to do because
2044          * we are in the parent and we know that the filp still
2045          * exists and has a nonzero count:
2046          */
2047         atomic_long_inc(&parent_counter->filp->f_count);
2048
2049         /*
2050          * Link this into the parent counter's child list
2051          */
2052         mutex_lock(&parent_counter->mutex);
2053         list_add_tail(&child_counter->child_list, &parent_counter->child_list);
2054
2055         /*
2056          * Make the child state follow the state of the parent counter,
2057          * not its hw_event.disabled bit.  We hold the parent's mutex,
2058          * so we won't race with perf_counter_{en,dis}able_family.
2059          */
2060         if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
2061                 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
2062         else
2063                 child_counter->state = PERF_COUNTER_STATE_OFF;
2064
2065         mutex_unlock(&parent_counter->mutex);
2066
2067         return child_counter;
2068 }
2069
2070 static int inherit_group(struct perf_counter *parent_counter,
2071               struct task_struct *parent,
2072               struct perf_counter_context *parent_ctx,
2073               struct task_struct *child,
2074               struct perf_counter_context *child_ctx)
2075 {
2076         struct perf_counter *leader;
2077         struct perf_counter *sub;
2078
2079         leader = inherit_counter(parent_counter, parent, parent_ctx,
2080                                  child, NULL, child_ctx);
2081         if (!leader)
2082                 return -ENOMEM;
2083         list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
2084                 if (!inherit_counter(sub, parent, parent_ctx,
2085                                      child, leader, child_ctx))
2086                         return -ENOMEM;
2087         }
2088         return 0;
2089 }
2090
2091 static void sync_child_counter(struct perf_counter *child_counter,
2092                                struct perf_counter *parent_counter)
2093 {
2094         u64 parent_val, child_val;
2095
2096         parent_val = atomic64_read(&parent_counter->count);
2097         child_val = atomic64_read(&child_counter->count);
2098
2099         /*
2100          * Add back the child's count to the parent's count:
2101          */
2102         atomic64_add(child_val, &parent_counter->count);
2103
2104         /*
2105          * Remove this counter from the parent's list
2106          */
2107         mutex_lock(&parent_counter->mutex);
2108         list_del_init(&child_counter->child_list);
2109         mutex_unlock(&parent_counter->mutex);
2110
2111         /*
2112          * Release the parent counter, if this was the last
2113          * reference to it.
2114          */
2115         fput(parent_counter->filp);
2116 }
2117
2118 static void
2119 __perf_counter_exit_task(struct task_struct *child,
2120                          struct perf_counter *child_counter,
2121                          struct perf_counter_context *child_ctx)
2122 {
2123         struct perf_counter *parent_counter;
2124         struct perf_counter *sub, *tmp;
2125
2126         /*
2127          * If we do not self-reap then we have to wait for the
2128          * child task to unschedule (it will happen for sure),
2129          * so that its counter is at its final count. (This
2130          * condition triggers rarely - child tasks usually get
2131          * off their CPU before the parent has a chance to
2132          * get this far into the reaping action)
2133          */
2134         if (child != current) {
2135                 wait_task_inactive(child, 0);
2136                 list_del_init(&child_counter->list_entry);
2137         } else {
2138                 struct perf_cpu_context *cpuctx;
2139                 unsigned long flags;
2140                 u64 perf_flags;
2141
2142                 /*
2143                  * Disable and unlink this counter.
2144                  *
2145                  * Be careful about zapping the list - IRQ/NMI context
2146                  * could still be processing it:
2147                  */
2148                 curr_rq_lock_irq_save(&flags);
2149                 perf_flags = hw_perf_save_disable();
2150
2151                 cpuctx = &__get_cpu_var(perf_cpu_context);
2152
2153                 group_sched_out(child_counter, cpuctx, child_ctx);
2154
2155                 list_del_init(&child_counter->list_entry);
2156
2157                 child_ctx->nr_counters--;
2158
2159                 hw_perf_restore(perf_flags);
2160                 curr_rq_unlock_irq_restore(&flags);
2161         }
2162
2163         parent_counter = child_counter->parent;
2164         /*
2165          * It can happen that parent exits first, and has counters
2166          * that are still around due to the child reference. These
2167          * counters need to be zapped - but otherwise linger.
2168          */
2169         if (parent_counter) {
2170                 sync_child_counter(child_counter, parent_counter);
2171                 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
2172                                          list_entry) {
2173                         if (sub->parent) {
2174                                 sync_child_counter(sub, sub->parent);
2175                                 kfree(sub);
2176                         }
2177                 }
2178                 kfree(child_counter);
2179         }
2180 }
2181
2182 /*
2183  * When a child task exits, feed back counter values to parent counters.
2184  *
2185  * Note: we may be running in child context, but the PID is not hashed
2186  * anymore so new counters will not be added.
2187  */
2188 void perf_counter_exit_task(struct task_struct *child)
2189 {
2190         struct perf_counter *child_counter, *tmp;
2191         struct perf_counter_context *child_ctx;
2192
2193         child_ctx = &child->perf_counter_ctx;
2194
2195         if (likely(!child_ctx->nr_counters))
2196                 return;
2197
2198         list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
2199                                  list_entry)
2200                 __perf_counter_exit_task(child, child_counter, child_ctx);
2201 }
2202
2203 /*
2204  * Initialize the perf_counter context in task_struct
2205  */
2206 void perf_counter_init_task(struct task_struct *child)
2207 {
2208         struct perf_counter_context *child_ctx, *parent_ctx;
2209         struct perf_counter *counter;
2210         struct task_struct *parent = current;
2211
2212         child_ctx  =  &child->perf_counter_ctx;
2213         parent_ctx = &parent->perf_counter_ctx;
2214
2215         __perf_counter_init_context(child_ctx, child);
2216
2217         /*
2218          * This is executed from the parent task context, so inherit
2219          * counters that have been marked for cloning:
2220          */
2221
2222         if (likely(!parent_ctx->nr_counters))
2223                 return;
2224
2225         /*
2226          * Lock the parent list. No need to lock the child - not PID
2227          * hashed yet and not running, so nobody can access it.
2228          */
2229         mutex_lock(&parent_ctx->mutex);
2230
2231         /*
2232          * We dont have to disable NMIs - we are only looking at
2233          * the list, not manipulating it:
2234          */
2235         list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
2236                 if (!counter->hw_event.inherit)
2237                         continue;
2238
2239                 if (inherit_group(counter, parent,
2240                                   parent_ctx, child, child_ctx))
2241                         break;
2242         }
2243
2244         mutex_unlock(&parent_ctx->mutex);
2245 }
2246
2247 static void __cpuinit perf_counter_init_cpu(int cpu)
2248 {
2249         struct perf_cpu_context *cpuctx;
2250
2251         cpuctx = &per_cpu(perf_cpu_context, cpu);
2252         __perf_counter_init_context(&cpuctx->ctx, NULL);
2253
2254         mutex_lock(&perf_resource_mutex);
2255         cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
2256         mutex_unlock(&perf_resource_mutex);
2257
2258         hw_perf_counter_setup(cpu);
2259 }
2260
2261 #ifdef CONFIG_HOTPLUG_CPU
2262 static void __perf_counter_exit_cpu(void *info)
2263 {
2264         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
2265         struct perf_counter_context *ctx = &cpuctx->ctx;
2266         struct perf_counter *counter, *tmp;
2267
2268         list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
2269                 __perf_counter_remove_from_context(counter);
2270 }
2271 static void perf_counter_exit_cpu(int cpu)
2272 {
2273         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
2274         struct perf_counter_context *ctx = &cpuctx->ctx;
2275
2276         mutex_lock(&ctx->mutex);
2277         smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
2278         mutex_unlock(&ctx->mutex);
2279 }
2280 #else
2281 static inline void perf_counter_exit_cpu(int cpu) { }
2282 #endif
2283
2284 static int __cpuinit
2285 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
2286 {
2287         unsigned int cpu = (long)hcpu;
2288
2289         switch (action) {
2290
2291         case CPU_UP_PREPARE:
2292         case CPU_UP_PREPARE_FROZEN:
2293                 perf_counter_init_cpu(cpu);
2294                 break;
2295
2296         case CPU_DOWN_PREPARE:
2297         case CPU_DOWN_PREPARE_FROZEN:
2298                 perf_counter_exit_cpu(cpu);
2299                 break;
2300
2301         default:
2302                 break;
2303         }
2304
2305         return NOTIFY_OK;
2306 }
2307
2308 static struct notifier_block __cpuinitdata perf_cpu_nb = {
2309         .notifier_call          = perf_cpu_notify,
2310 };
2311
2312 static int __init perf_counter_init(void)
2313 {
2314         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
2315                         (void *)(long)smp_processor_id());
2316         register_cpu_notifier(&perf_cpu_nb);
2317
2318         return 0;
2319 }
2320 early_initcall(perf_counter_init);
2321
2322 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
2323 {
2324         return sprintf(buf, "%d\n", perf_reserved_percpu);
2325 }
2326
2327 static ssize_t
2328 perf_set_reserve_percpu(struct sysdev_class *class,
2329                         const char *buf,
2330                         size_t count)
2331 {
2332         struct perf_cpu_context *cpuctx;
2333         unsigned long val;
2334         int err, cpu, mpt;
2335
2336         err = strict_strtoul(buf, 10, &val);
2337         if (err)
2338                 return err;
2339         if (val > perf_max_counters)
2340                 return -EINVAL;
2341
2342         mutex_lock(&perf_resource_mutex);
2343         perf_reserved_percpu = val;
2344         for_each_online_cpu(cpu) {
2345                 cpuctx = &per_cpu(perf_cpu_context, cpu);
2346                 spin_lock_irq(&cpuctx->ctx.lock);
2347                 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
2348                           perf_max_counters - perf_reserved_percpu);
2349                 cpuctx->max_pertask = mpt;
2350                 spin_unlock_irq(&cpuctx->ctx.lock);
2351         }
2352         mutex_unlock(&perf_resource_mutex);
2353
2354         return count;
2355 }
2356
2357 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
2358 {
2359         return sprintf(buf, "%d\n", perf_overcommit);
2360 }
2361
2362 static ssize_t
2363 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
2364 {
2365         unsigned long val;
2366         int err;
2367
2368         err = strict_strtoul(buf, 10, &val);
2369         if (err)
2370                 return err;
2371         if (val > 1)
2372                 return -EINVAL;
2373
2374         mutex_lock(&perf_resource_mutex);
2375         perf_overcommit = val;
2376         mutex_unlock(&perf_resource_mutex);
2377
2378         return count;
2379 }
2380
2381 static SYSDEV_CLASS_ATTR(
2382                                 reserve_percpu,
2383                                 0644,
2384                                 perf_show_reserve_percpu,
2385                                 perf_set_reserve_percpu
2386                         );
2387
2388 static SYSDEV_CLASS_ATTR(
2389                                 overcommit,
2390                                 0644,
2391                                 perf_show_overcommit,
2392                                 perf_set_overcommit
2393                         );
2394
2395 static struct attribute *perfclass_attrs[] = {
2396         &attr_reserve_percpu.attr,
2397         &attr_overcommit.attr,
2398         NULL
2399 };
2400
2401 static struct attribute_group perfclass_attr_group = {
2402         .attrs                  = perfclass_attrs,
2403         .name                   = "perf_counters",
2404 };
2405
2406 static int __init perf_counter_sysfs_init(void)
2407 {
2408         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
2409                                   &perfclass_attr_group);
2410 }
2411 device_initcall(perf_counter_sysfs_init);