static atomic_t nr_mmap_counters __read_mostly;
static atomic_t nr_comm_counters __read_mostly;
-int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
+/*
+ * perf counter paranoia level:
+ * 0 - not paranoid
+ * 1 - disallow CPU counters for unprivileged users
+ * 2 - disallow kernel profiling for unprivileged users
+ */
+int sysctl_perf_counter_paranoid __read_mostly;
+
+static inline bool perf_paranoid_cpu(void)
+{
+ return sysctl_perf_counter_paranoid > 0;
+}
+
+static inline bool perf_paranoid_kernel(void)
+{
+ return sysctl_perf_counter_paranoid > 1;
+}
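Presumably the renamed knob is exported next to the existing perf sysctls; a minimal sketch of the matching kernel/sysctl.c entry follows (the table name and placement here are assumptions, not part of this hunk):

	static struct ctl_table perf_ctl_entries[] = {
		{
			.ctl_name	= CTL_UNNUMBERED,
			.procname	= "perf_counter_paranoid",
			.data		= &sysctl_perf_counter_paranoid,
			.maxlen		= sizeof(sysctl_perf_counter_paranoid),
			.mode		= 0644,
			.proc_handler	= &proc_dointvec,
		},
		{ .ctl_name = 0 }	/* sentinel */
	};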
+
int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
-int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */
+
+/*
+ * max perf counter sample rate
+ */
+int sysctl_perf_counter_sample_rate __read_mostly = 100000;
static atomic64_t perf_counter_id;
int do_switch = 1;
regs = task_pt_regs(task);
- perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
+ perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
if (likely(!ctx || !cpuctx->task_ctx))
return;
if (interrupts == MAX_INTERRUPTS) {
perf_log_throttle(counter, 1);
counter->pmu->unthrottle(counter);
- interrupts = 2*sysctl_perf_counter_limit/HZ;
+ interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
}
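For concreteness: with HZ = 1000 and the default sysctl_perf_counter_sample_rate of 100000, the throttle test further down (HZ * hwc->interrupts > sample_rate) trips once a counter takes more than 100000 / 1000 = 100 interrupts inside a single tick; on unthrottle, the line above seeds the interrupt count with two ticks' worth of the maximum rate, 2 * 100000 / 1000 = 200, presumably so the frequency-adjustment code recomputes the period as if the counter had been running flat out.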
if (!counter->attr.freq || !counter->attr.sample_freq)
*/
if (cpu != -1) {
/* Must be root to operate on a CPU counter: */
- if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
+ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
return ERR_PTR(-EACCES);
if (cpu < 0 || cpu > num_possible_cpus())
spin_lock_irq(&ctx->lock);
if (counter->attr.freq) {
- if (value > sysctl_perf_counter_limit) {
+ if (value > sysctl_perf_counter_sample_rate) {
ret = -EINVAL;
goto unlock;
}
return task_pid_nr_ns(p, counter->ns);
}
-static void perf_counter_output(struct perf_counter *counter,
- int nmi, struct pt_regs *regs, u64 addr)
+static void perf_counter_output(struct perf_counter *counter, int nmi,
+ struct perf_sample_data *data)
{
int ret;
u64 sample_type = counter->attr.sample_type;
header.size = sizeof(header);
header.misc = PERF_EVENT_MISC_OVERFLOW;
- header.misc |= perf_misc_flags(regs);
+ header.misc |= perf_misc_flags(data->regs);
if (sample_type & PERF_SAMPLE_IP) {
- ip = perf_instruction_pointer(regs);
+ ip = perf_instruction_pointer(data->regs);
header.type |= PERF_SAMPLE_IP;
header.size += sizeof(ip);
}
}
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- callchain = perf_callchain(regs);
+ callchain = perf_callchain(data->regs);
if (callchain) {
callchain_size = (1 + callchain->nr) * sizeof(u64);
perf_output_put(&handle, time);
if (sample_type & PERF_SAMPLE_ADDR)
- perf_output_put(&handle, addr);
+ perf_output_put(&handle, data->addr);
if (sample_type & PERF_SAMPLE_ID)
perf_output_put(&handle, counter->id);
perf_output_put(&handle, cpu_entry);
if (sample_type & PERF_SAMPLE_PERIOD)
- perf_output_put(&handle, counter->hw.sample_period);
+ perf_output_put(&handle, data->period);
/*
* XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
struct {
struct perf_event_header header;
u64 time;
+ u64 id;
} throttle_event = {
.header = {
.type = PERF_EVENT_THROTTLE + 1,
.misc = 0,
.size = sizeof(throttle_event),
},
- .time = sched_clock(),
+ .time = sched_clock(),
+ .id = counter->id,
};
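With the new id field, a consumer of the mmap buffer can tell which counter a throttle record refers to. The userspace view presumably mirrors the anonymous struct above; a sketch with a hypothetical name (the exact header.type encoding of throttle vs. unthrottle follows whatever perf_log_throttle's enable argument selects):

	struct perf_throttle_event {
		struct perf_event_header	header;	/* throttle vs. unthrottle record type */
		__u64				time;	/* sched_clock() timestamp */
		__u64				id;	/* counter->id of the (un)throttled counter */
	};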
ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
* Generic counter overflow handling.
*/
-int perf_counter_overflow(struct perf_counter *counter,
- int nmi, struct pt_regs *regs, u64 addr)
+int perf_counter_overflow(struct perf_counter *counter, int nmi,
+ struct perf_sample_data *data)
{
int events = atomic_read(&counter->event_limit);
int throttle = counter->pmu->unthrottle != NULL;
} else {
if (hwc->interrupts != MAX_INTERRUPTS) {
hwc->interrupts++;
- if (HZ * hwc->interrupts > (u64)sysctl_perf_counter_limit) {
+ if (HZ * hwc->interrupts >
+ (u64)sysctl_perf_counter_sample_rate) {
hwc->interrupts = MAX_INTERRUPTS;
perf_log_throttle(counter, 0);
ret = 1;
perf_counter_disable(counter);
}
- perf_counter_output(counter, nmi, regs, addr);
+ perf_counter_output(counter, nmi, data);
return ret;
}
if (unlikely(left <= -period)) {
left = period;
atomic64_set(&hwc->period_left, left);
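+ /* record the period this sample window used (reported as PERF_SAMPLE_PERIOD) */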
+ hwc->last_period = period;
}
if (unlikely(left <= 0)) {
left += period;
atomic64_add(period, &hwc->period_left);
+ hwc->last_period = period;
}
atomic64_set(&hwc->prev_count, -left);
static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
{
enum hrtimer_restart ret = HRTIMER_RESTART;
+ struct perf_sample_data data;
struct perf_counter *counter;
- struct pt_regs *regs;
u64 period;
counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
counter->pmu->read(counter);
- regs = get_irq_regs();
+ data.addr = 0;
+ data.regs = get_irq_regs();
/*
* In case we exclude kernel IPs or are somehow not in interrupt
* context, provide the next best thing, the user IP.
*/
- if ((counter->attr.exclude_kernel || !regs) &&
+ if ((counter->attr.exclude_kernel || !data.regs) &&
!counter->attr.exclude_user)
- regs = task_pt_regs(current);
+ data.regs = task_pt_regs(current);
- if (regs) {
- if (perf_counter_overflow(counter, 0, regs, 0))
+ if (data.regs) {
+ if (perf_counter_overflow(counter, 0, &data))
ret = HRTIMER_NORESTART;
}
static void perf_swcounter_overflow(struct perf_counter *counter,
int nmi, struct pt_regs *regs, u64 addr)
{
+ struct perf_sample_data data = {
+ .regs = regs,
+ .addr = addr,
+ .period = counter->hw.last_period,
+ };
+
perf_swcounter_update(counter);
perf_swcounter_set_period(counter);
- if (perf_counter_overflow(counter, nmi, regs, addr))
+ if (perf_counter_overflow(counter, nmi, &data))
/* soft-disable the counter */
;
}
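The initializer above pulls together everything perf_counter_overflow() now takes; the new struct is presumably declared on the header side of this patch roughly as follows (a sketch reconstructed from the three fields used here):

	struct perf_sample_data {
		struct pt_regs	*regs;
		u64		addr;
		u64		period;
	};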
static int perf_swcounter_match(struct perf_counter *counter,
- enum perf_event_types type,
+ enum perf_type_id type,
u32 event, struct pt_regs *regs)
{
if (!perf_swcounter_is_counting(counter))
}
static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
- enum perf_event_types type, u32 event,
+ enum perf_type_id type, u32 event,
u64 nr, int nmi, struct pt_regs *regs,
u64 addr)
{
return &cpuctx->recursion[0];
}
-static void __perf_swcounter_event(enum perf_event_types type, u32 event,
+static void __perf_swcounter_event(enum perf_type_id type, u32 event,
u64 nr, int nmi, struct pt_regs *regs,
u64 addr)
{
struct perf_counter_context *ctx;
perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
- PERF_COUNT_CPU_MIGRATIONS,
+ PERF_COUNT_SW_CPU_MIGRATIONS,
1, 1, NULL, 0);
ctx = perf_pin_task_context(task);
if (ctx) {
perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
- PERF_COUNT_CPU_MIGRATIONS,
+ PERF_COUNT_SW_CPU_MIGRATIONS,
1, 1, NULL, 0);
perf_unpin_context(ctx);
}
* events.
*/
switch (counter->attr.config) {
- case PERF_COUNT_CPU_CLOCK:
+ case PERF_COUNT_SW_CPU_CLOCK:
pmu = &perf_ops_cpu_clock;
break;
- case PERF_COUNT_TASK_CLOCK:
+ case PERF_COUNT_SW_TASK_CLOCK:
/*
* If the user instantiates this as a per-cpu counter,
* use the cpu_clock counter instead.
pmu = &perf_ops_cpu_clock;
break;
- case PERF_COUNT_PAGE_FAULTS:
- case PERF_COUNT_PAGE_FAULTS_MIN:
- case PERF_COUNT_PAGE_FAULTS_MAJ:
- case PERF_COUNT_CONTEXT_SWITCHES:
- case PERF_COUNT_CPU_MIGRATIONS:
+ case PERF_COUNT_SW_PAGE_FAULTS:
+ case PERF_COUNT_SW_PAGE_FAULTS_MIN:
+ case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
+ case PERF_COUNT_SW_CONTEXT_SWITCHES:
+ case PERF_COUNT_SW_CPU_MIGRATIONS:
pmu = &perf_ops_generic;
break;
}
if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
goto done;
- if (attr->type == PERF_TYPE_RAW) {
- pmu = hw_perf_counter_init(counter);
- goto done;
- }
-
switch (attr->type) {
+ case PERF_TYPE_RAW:
case PERF_TYPE_HARDWARE:
case PERF_TYPE_HW_CACHE:
pmu = hw_perf_counter_init(counter);
case PERF_TYPE_TRACEPOINT:
pmu = tp_perf_counter_init(counter);
break;
+
+ default:
+ break;
}
done:
err = 0;
return counter;
}
+static int perf_copy_attr(struct perf_counter_attr __user *uattr,
+ struct perf_counter_attr *attr)
+{
+ int ret;
+ u32 size;
+
+ if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
+ return -EFAULT;
+
+ /*
+ * Zero the full structure, so that a short copy leaves the
+ * uncopied tail zeroed.
+ */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE) /* silly large */
+ goto err_size;
+
+ if (!size) /* abi compat */
+ size = PERF_ATTR_SIZE_VER0;
+
+ if (size < PERF_ATTR_SIZE_VER0)
+ goto err_size;
+
+ /*
+ * If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0.
+ */
+ if (size > sizeof(*attr)) {
+ unsigned long val;
+ unsigned long __user *addr;
+ unsigned long __user *end;
+
+ addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
+ sizeof(unsigned long));
+ end = PTR_ALIGN((void __user *)uattr + size,
+ sizeof(unsigned long));
+
+ /* addr is an unsigned long pointer: step one word at a time */
+ for (; addr < end; addr++) {
+ ret = get_user(val, addr);
+ if (ret)
+ return ret;
+ if (val)
+ goto err_size;
+ }
+ }
+
+ ret = copy_from_user(attr, uattr, size);
+ if (ret)
+ return -EFAULT;
+
+ /*
+ * If the type is known, the type-specific initialization
+ * (e.g. hw_perf_counter_init()) will verify attr->config.
+ */
+ if (attr->type >= PERF_TYPE_MAX)
+ return -EINVAL;
+
+ if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
+ return -EINVAL;
+
+ if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
+ return -EINVAL;
+
+ if (attr->read_format & ~(PERF_FORMAT_MAX-1))
+ return -EINVAL;
+
+out:
+ return ret;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ ret = -E2BIG;
+ goto out;
+}
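To illustrate the size-based ABI handshake from the user side: a caller fills attr.size with the size it was compiled against, so an older kernel can reject unknown non-zero tail bits (via the loop above) and a newer kernel can zero-extend a short struct. A hypothetical userspace sketch; the raw syscall() invocation of __NR_perf_counter_open is illustrative:

	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/perf_counter.h>

	int open_ctx_switch_counter(void)
	{
		struct perf_counter_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);	/* tells the kernel which ABI revision we speak */
		attr.type = PERF_TYPE_SOFTWARE;
		attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
		attr.exclude_kernel = 1;	/* stays below the new paranoia check */

		/* pid = 0 (self), cpu = -1 (any), no group leader, no flags */
		return syscall(__NR_perf_counter_open, &attr, 0, -1, -1, 0);
	}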
+
/**
* sys_perf_counter_open - open a performance counter, associate it to a task/cpu
*
* @group_fd: group leader counter fd
*/
SYSCALL_DEFINE5(perf_counter_open,
- const struct perf_counter_attr __user *, attr_uptr,
+ struct perf_counter_attr __user *, attr_uptr,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
struct perf_counter *counter, *group_leader;
if (flags)
return -EINVAL;
- if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0)
- return -EFAULT;
+ ret = perf_copy_attr(attr_uptr, &attr);
+ if (ret)
+ return ret;
+
+ if (!attr.exclude_kernel) {
+ if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
+ if (attr.freq) {
+ if (attr.sample_freq > sysctl_perf_counter_sample_rate)
+ return -EINVAL;
+ }
/*
* Get the target context (task or percpu):