X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;f=kernel%2Fperf_counter.c;h=ef5d8a5b245390130f76e9284f372396dfa7ddd5;hb=92636b152f3b58e459988934f689619af9e04dbc;hp=df319c48c52b9d9bc3572d68462cf9716fd2e884;hpb=22a4f650d686eeaac3629dae1c4294381485efdf;p=platform%2Fkernel%2Flinux-starfive.git diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index df319c4..ef5d8a5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -40,13 +40,35 @@ static int perf_reserved_percpu __read_mostly; static int perf_overcommit __read_mostly = 1; static atomic_t nr_counters __read_mostly; -static atomic_t nr_mmap_tracking __read_mostly; -static atomic_t nr_munmap_tracking __read_mostly; -static atomic_t nr_comm_tracking __read_mostly; +static atomic_t nr_mmap_counters __read_mostly; +static atomic_t nr_comm_counters __read_mostly; + +/* + * perf counter paranoia level: + * 0 - not paranoid + * 1 - disallow cpu counters to unpriv + * 2 - disallow kernel profiling to unpriv + */ +int sysctl_perf_counter_paranoid __read_mostly; + +static inline bool perf_paranoid_cpu(void) +{ + return sysctl_perf_counter_paranoid > 0; +} + +static inline bool perf_paranoid_kernel(void) +{ + return sysctl_perf_counter_paranoid > 1; +} -int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ -int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */ + +/* + * max perf counter sample rate + */ +int sysctl_perf_counter_sample_rate __read_mostly = 100000; + +static atomic64_t perf_counter_id; /* * Lock for (sysadmin-configurable) counter reservations: @@ -260,7 +282,7 @@ counter_sched_out(struct perf_counter *counter, if (!is_software_counter(counter)) cpuctx->active_oncpu--; ctx->nr_active--; - if (counter->hw_event.exclusive || !cpuctx->active_oncpu) + if (counter->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; } @@ -282,7 +304,7 @@ group_sched_out(struct perf_counter *group_counter, list_for_each_entry(counter, &group_counter->sibling_list, list_entry) counter_sched_out(counter, cpuctx, ctx); - if (group_counter->hw_event.exclusive) + if (group_counter->attr.exclusive) cpuctx->exclusive = 0; } @@ -550,7 +572,7 @@ counter_sched_in(struct perf_counter *counter, cpuctx->active_oncpu++; ctx->nr_active++; - if (counter->hw_event.exclusive) + if (counter->attr.exclusive) cpuctx->exclusive = 1; return 0; @@ -572,7 +594,6 @@ group_sched_in(struct perf_counter *group_counter, if (ret) return ret < 0 ? ret : 0; - group_counter->prev_state = group_counter->state; if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) return -EAGAIN; @@ -580,7 +601,6 @@ group_sched_in(struct perf_counter *group_counter, * Schedule in siblings as one group (if any): */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - counter->prev_state = counter->state; if (counter_sched_in(counter, cpuctx, ctx, cpu)) { partial_group = counter; goto group_error; @@ -644,7 +664,7 @@ static int group_can_go_on(struct perf_counter *counter, * If this group is exclusive and there are already * counters on the CPU, it can't go on. 
*/ - if (counter->hw_event.exclusive && cpuctx->active_oncpu) + if (counter->attr.exclusive && cpuctx->active_oncpu) return 0; /* * Otherwise, try to add it if all previous groups were able @@ -657,7 +677,6 @@ static void add_counter_to_ctx(struct perf_counter *counter, struct perf_counter_context *ctx) { list_add_counter(counter, ctx); - counter->prev_state = PERF_COUNTER_STATE_OFF; counter->tstamp_enabled = ctx->time; counter->tstamp_running = ctx->time; counter->tstamp_stopped = ctx->time; @@ -728,7 +747,7 @@ static void __perf_install_in_context(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) { + if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; } @@ -820,7 +839,6 @@ static void __perf_counter_enable(void *info) ctx->is_active = 1; update_context_time(ctx); - counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; @@ -853,7 +871,7 @@ static void __perf_counter_enable(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) { + if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; } @@ -931,7 +949,7 @@ static int perf_counter_refresh(struct perf_counter *counter, int refresh) /* * not supported on inherited counters */ - if (counter->hw_event.inherit) + if (counter->attr.inherit) return -EINVAL; atomic_add(refresh, &counter->event_limit); @@ -1006,7 +1024,7 @@ void perf_counter_task_sched_out(struct task_struct *task, int do_switch = 1; regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); + perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); if (likely(!ctx || !cpuctx->task_ctx)) return; @@ -1098,7 +1116,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, */ list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state <= PERF_COUNTER_STATE_OFF || - !counter->hw_event.pinned) + !counter->attr.pinned) continue; if (counter->cpu != -1 && counter->cpu != cpu) continue; @@ -1126,7 +1144,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, * ignore pinned counters since we did them already. 
*/ if (counter->state <= PERF_COUNTER_STATE_OFF || - counter->hw_event.pinned) + counter->attr.pinned) continue; /* @@ -1187,44 +1205,88 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) static void perf_log_throttle(struct perf_counter *counter, int enable); static void perf_log_period(struct perf_counter *counter, u64 period); -static void perf_adjust_freq(struct perf_counter_context *ctx) +static void perf_adjust_period(struct perf_counter *counter, u64 events) { - struct perf_counter *counter; - u64 interrupts, irq_period; - u64 events, period; + struct hw_perf_counter *hwc = &counter->hw; + u64 period, sample_period; s64 delta; + events *= hwc->sample_period; + period = div64_u64(events, counter->attr.sample_freq); + + delta = (s64)(period - hwc->sample_period); + delta = (delta + 7) / 8; /* low pass filter */ + + sample_period = hwc->sample_period + delta; + + if (!sample_period) + sample_period = 1; + + perf_log_period(counter, sample_period); + + hwc->sample_period = sample_period; +} + +static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + struct hw_perf_counter *hwc; + u64 interrupts, freq; + spin_lock(&ctx->lock); list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state != PERF_COUNTER_STATE_ACTIVE) continue; - interrupts = counter->hw.interrupts; - counter->hw.interrupts = 0; + hwc = &counter->hw; + + interrupts = hwc->interrupts; + hwc->interrupts = 0; + /* + * unthrottle counters on the tick + */ if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(counter, 1); counter->pmu->unthrottle(counter); - interrupts = 2*sysctl_perf_counter_limit/HZ; + interrupts = 2*sysctl_perf_counter_sample_rate/HZ; } - if (!counter->hw_event.freq || !counter->hw_event.irq_freq) + if (!counter->attr.freq || !counter->attr.sample_freq) continue; - events = HZ * interrupts * counter->hw.irq_period; - period = div64_u64(events, counter->hw_event.irq_freq); + /* + * if the specified freq < HZ then we need to skip ticks + */ + if (counter->attr.sample_freq < HZ) { + freq = counter->attr.sample_freq; - delta = (s64)(1 + period - counter->hw.irq_period); - delta >>= 1; + hwc->freq_count += freq; + hwc->freq_interrupts += interrupts; - irq_period = counter->hw.irq_period + delta; + if (hwc->freq_count < HZ) + continue; - if (!irq_period) - irq_period = 1; + interrupts = hwc->freq_interrupts; + hwc->freq_interrupts = 0; + hwc->freq_count -= HZ; + } else + freq = HZ; - perf_log_period(counter, irq_period); + perf_adjust_period(counter, freq * interrupts); - counter->hw.irq_period = irq_period; + /* + * In order to avoid being stalled by an (accidental) huge + * sample period, force reset the sample period if we didn't + * get any events in this freq period. 
+ */ + if (!interrupts) { + perf_disable(); + counter->pmu->disable(counter); + atomic_set(&hwc->period_left, 0); + counter->pmu->enable(counter); + perf_enable(); + } } spin_unlock(&ctx->lock); } @@ -1264,9 +1326,9 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = curr->perf_counter_ctxp; - perf_adjust_freq(&cpuctx->ctx); + perf_ctx_adjust_freq(&cpuctx->ctx); if (ctx) - perf_adjust_freq(ctx); + perf_ctx_adjust_freq(ctx); perf_counter_cpu_sched_out(cpuctx); if (ctx) @@ -1344,7 +1406,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) */ if (cpu != -1) { /* Must be root to operate on a CPU counter: */ - if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); if (cpu < 0 || cpu > num_possible_cpus()) @@ -1436,6 +1498,8 @@ static void free_counter_rcu(struct rcu_head *head) struct perf_counter *counter; counter = container_of(head, struct perf_counter, rcu_head); + if (counter->ns) + put_pid_ns(counter->ns); kfree(counter); } @@ -1446,12 +1510,10 @@ static void free_counter(struct perf_counter *counter) perf_pending_sync(counter); atomic_dec(&nr_counters); - if (counter->hw_event.mmap) - atomic_dec(&nr_mmap_tracking); - if (counter->hw_event.munmap) - atomic_dec(&nr_munmap_tracking); - if (counter->hw_event.comm) - atomic_dec(&nr_comm_tracking); + if (counter->attr.mmap) + atomic_dec(&nr_mmap_counters); + if (counter->attr.comm) + atomic_dec(&nr_comm_counters); if (counter->destroy) counter->destroy(counter); @@ -1506,12 +1568,14 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) mutex_lock(&counter->child_mutex); values[0] = perf_counter_read(counter); n = 1; - if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = counter->total_time_enabled + atomic64_read(&counter->child_total_time_enabled); - if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); + if (counter->attr.read_format & PERF_FORMAT_ID) + values[n++] = counter->id; mutex_unlock(&counter->child_mutex); if (count < n * sizeof(u64)) @@ -1604,6 +1668,43 @@ static void perf_counter_for_each(struct perf_counter *counter, mutex_unlock(&counter->child_mutex); } +static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) +{ + struct perf_counter_context *ctx = counter->ctx; + unsigned long size; + int ret = 0; + u64 value; + + if (!counter->attr.sample_period) + return -EINVAL; + + size = copy_from_user(&value, arg, sizeof(value)); + if (size != sizeof(value)) + return -EFAULT; + + if (!value) + return -EINVAL; + + spin_lock_irq(&ctx->lock); + if (counter->attr.freq) { + if (value > sysctl_perf_counter_sample_rate) { + ret = -EINVAL; + goto unlock; + } + + counter->attr.sample_freq = value; + } else { + perf_log_period(counter, value); + + counter->attr.sample_period = value; + counter->hw.sample_period = value; + } +unlock: + spin_unlock_irq(&ctx->lock); + + return ret; +} + static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_counter *counter = file->private_data; @@ -1623,6 +1724,10 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_COUNTER_IOC_REFRESH: return 
perf_counter_refresh(counter, arg); + + case PERF_COUNTER_IOC_PERIOD: + return perf_counter_period(counter, (u64 __user *)arg); + default: return -ENOTTY; } @@ -2067,8 +2172,8 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) struct perf_output_handle { struct perf_counter *counter; struct perf_mmap_data *data; - unsigned int offset; - unsigned int head; + unsigned long head; + unsigned long offset; int nmi; int overflow; int locked; @@ -2122,7 +2227,8 @@ static void perf_output_lock(struct perf_output_handle *handle) static void perf_output_unlock(struct perf_output_handle *handle) { struct perf_mmap_data *data = handle->data; - int head, cpu; + unsigned long head; + int cpu; data->done_head = data->head; @@ -2135,7 +2241,7 @@ again: * before we publish the new head, matched by a rmb() in userspace when * reading this position. */ - while ((head = atomic_xchg(&data->done_head, 0))) + while ((head = atomic_long_xchg(&data->done_head, 0))) data->user_page->data_head = head; /* @@ -2148,7 +2254,7 @@ again: /* * Therefore we have to validate we did not indeed do so. */ - if (unlikely(atomic_read(&data->done_head))) { + if (unlikely(atomic_long_read(&data->done_head))) { /* * Since we had it locked, we can lock it again. */ @@ -2193,9 +2299,9 @@ static int perf_output_begin(struct perf_output_handle *handle, perf_output_lock(handle); do { - offset = head = atomic_read(&data->head); + offset = head = atomic_long_read(&data->head); head += size; - } while (atomic_cmpxchg(&data->head, offset, head) != offset); + } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); handle->offset = offset; handle->head = head; @@ -2214,7 +2320,7 @@ out: } static void perf_output_copy(struct perf_output_handle *handle, - void *buf, unsigned int len) + const void *buf, unsigned int len) { unsigned int pages_mask; unsigned int offset; @@ -2246,7 +2352,7 @@ static void perf_output_copy(struct perf_output_handle *handle, * Check we didn't copy past our reservation window, taking the * possible unsigned int wrap into account. 
*/ - WARN_ON_ONCE(((int)(handle->head - handle->offset)) < 0); + WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); } #define perf_output_put(handle, x) \ @@ -2257,7 +2363,7 @@ static void perf_output_end(struct perf_output_handle *handle) struct perf_counter *counter = handle->counter; struct perf_mmap_data *data = handle->data; - int wakeup_events = counter->hw_event.wakeup_events; + int wakeup_events = counter->attr.wakeup_events; if (handle->overflow && wakeup_events) { int events = atomic_inc_return(&data->events); @@ -2271,11 +2377,33 @@ static void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr) +static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p) +{ + /* + * only top level counters have the pid namespace they were created in + */ + if (counter->parent) + counter = counter->parent; + + return task_tgid_nr_ns(p, counter->ns); +} + +static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) +{ + /* + * only top level counters have the pid namespace they were created in + */ + if (counter->parent) + counter = counter->parent; + + return task_pid_nr_ns(p, counter->ns); +} + +static void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data) { int ret; - u64 record_type = counter->hw_event.record_type; + u64 sample_type = counter->attr.sample_type; struct perf_output_handle handle; struct perf_event_header header; u64 ip; @@ -2283,7 +2411,7 @@ static void perf_counter_output(struct perf_counter *counter, u32 pid, tid; } tid_entry; struct { - u64 event; + u64 id; u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; @@ -2297,63 +2425,68 @@ static void perf_counter_output(struct perf_counter *counter, header.size = sizeof(header); header.misc = PERF_EVENT_MISC_OVERFLOW; - header.misc |= perf_misc_flags(regs); + header.misc |= perf_misc_flags(data->regs); - if (record_type & PERF_RECORD_IP) { - ip = perf_instruction_pointer(regs); - header.type |= PERF_RECORD_IP; + if (sample_type & PERF_SAMPLE_IP) { + ip = perf_instruction_pointer(data->regs); + header.type |= PERF_SAMPLE_IP; header.size += sizeof(ip); } - if (record_type & PERF_RECORD_TID) { + if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ - tid_entry.pid = current->group_leader->pid; - tid_entry.tid = current->pid; + tid_entry.pid = perf_counter_pid(counter, current); + tid_entry.tid = perf_counter_tid(counter, current); - header.type |= PERF_RECORD_TID; + header.type |= PERF_SAMPLE_TID; header.size += sizeof(tid_entry); } - if (record_type & PERF_RECORD_TIME) { + if (sample_type & PERF_SAMPLE_TIME) { /* * Maybe do better on x86 and provide cpu_clock_nmi() */ time = sched_clock(); - header.type |= PERF_RECORD_TIME; + header.type |= PERF_SAMPLE_TIME; header.size += sizeof(u64); } - if (record_type & PERF_RECORD_ADDR) { - header.type |= PERF_RECORD_ADDR; + if (sample_type & PERF_SAMPLE_ADDR) { + header.type |= PERF_SAMPLE_ADDR; header.size += sizeof(u64); } - if (record_type & PERF_RECORD_CONFIG) { - header.type |= PERF_RECORD_CONFIG; + if (sample_type & PERF_SAMPLE_ID) { + header.type |= PERF_SAMPLE_ID; header.size += sizeof(u64); } - if (record_type & PERF_RECORD_CPU) { - header.type |= PERF_RECORD_CPU; + if (sample_type & PERF_SAMPLE_CPU) { + header.type |= PERF_SAMPLE_CPU; header.size += sizeof(cpu_entry); cpu_entry.cpu = raw_smp_processor_id(); } - if (record_type & 
PERF_RECORD_GROUP) { - header.type |= PERF_RECORD_GROUP; + if (sample_type & PERF_SAMPLE_PERIOD) { + header.type |= PERF_SAMPLE_PERIOD; + header.size += sizeof(u64); + } + + if (sample_type & PERF_SAMPLE_GROUP) { + header.type |= PERF_SAMPLE_GROUP; header.size += sizeof(u64) + counter->nr_siblings * sizeof(group_entry); } - if (record_type & PERF_RECORD_CALLCHAIN) { - callchain = perf_callchain(regs); + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + callchain = perf_callchain(data->regs); if (callchain) { callchain_size = (1 + callchain->nr) * sizeof(u64); - header.type |= PERF_RECORD_CALLCHAIN; + header.type |= PERF_SAMPLE_CALLCHAIN; header.size += callchain_size; } } @@ -2364,28 +2497,31 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_put(&handle, header); - if (record_type & PERF_RECORD_IP) + if (sample_type & PERF_SAMPLE_IP) perf_output_put(&handle, ip); - if (record_type & PERF_RECORD_TID) + if (sample_type & PERF_SAMPLE_TID) perf_output_put(&handle, tid_entry); - if (record_type & PERF_RECORD_TIME) + if (sample_type & PERF_SAMPLE_TIME) perf_output_put(&handle, time); - if (record_type & PERF_RECORD_ADDR) - perf_output_put(&handle, addr); + if (sample_type & PERF_SAMPLE_ADDR) + perf_output_put(&handle, data->addr); - if (record_type & PERF_RECORD_CONFIG) - perf_output_put(&handle, counter->hw_event.config); + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(&handle, counter->id); - if (record_type & PERF_RECORD_CPU) + if (sample_type & PERF_SAMPLE_CPU) perf_output_put(&handle, cpu_entry); + if (sample_type & PERF_SAMPLE_PERIOD) + perf_output_put(&handle, data->period); + /* - * XXX PERF_RECORD_GROUP vs inherited counters seems difficult. + * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. */ - if (record_type & PERF_RECORD_GROUP) { + if (sample_type & PERF_SAMPLE_GROUP) { struct perf_counter *leader, *sub; u64 nr = counter->nr_siblings; @@ -2396,7 +2532,7 @@ static void perf_counter_output(struct perf_counter *counter, if (sub != counter) sub->pmu->read(sub); - group_entry.event = sub->hw_event.config; + group_entry.id = sub->id; group_entry.counter = atomic64_read(&sub->count); perf_output_put(&handle, group_entry); @@ -2410,6 +2546,104 @@ static void perf_counter_output(struct perf_counter *counter, } /* + * fork tracking + */ + +struct perf_fork_event { + struct task_struct *task; + + struct { + struct perf_event_header header; + + u32 pid; + u32 ppid; + } event; +}; + +static void perf_counter_fork_output(struct perf_counter *counter, + struct perf_fork_event *fork_event) +{ + struct perf_output_handle handle; + int size = fork_event->event.header.size; + struct task_struct *task = fork_event->task; + int ret = perf_output_begin(&handle, counter, size, 0, 0); + + if (ret) + return; + + fork_event->event.pid = perf_counter_pid(counter, task); + fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); + + perf_output_put(&handle, fork_event->event); + perf_output_end(&handle); +} + +static int perf_counter_fork_match(struct perf_counter *counter) +{ + if (counter->attr.comm || counter->attr.mmap) + return 1; + + return 0; +} + +static void perf_counter_fork_ctx(struct perf_counter_context *ctx, + struct perf_fork_event *fork_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_fork_match(counter)) + perf_counter_fork_output(counter, fork_event); + } + 
rcu_read_unlock(); +} + +static void perf_counter_fork_event(struct perf_fork_event *fork_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_fork_ctx(&cpuctx->ctx, fork_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_counter_ctxp); + if (ctx) + perf_counter_fork_ctx(ctx, fork_event); + rcu_read_unlock(); +} + +void perf_counter_fork(struct task_struct *task) +{ + struct perf_fork_event fork_event; + + if (!atomic_read(&nr_comm_counters) && + !atomic_read(&nr_mmap_counters)) + return; + + fork_event = (struct perf_fork_event){ + .task = task, + .event = { + .header = { + .type = PERF_EVENT_FORK, + .size = sizeof(fork_event.event), + }, + }, + }; + + perf_counter_fork_event(&fork_event); +} + +/* * comm tracking */ @@ -2436,17 +2670,18 @@ static void perf_counter_comm_output(struct perf_counter *counter, if (ret) return; + comm_event->event.pid = perf_counter_pid(counter, comm_event->task); + comm_event->event.tid = perf_counter_tid(counter, comm_event->task); + perf_output_put(&handle, comm_event->event); perf_output_copy(&handle, comm_event->comm, comm_event->comm_size); perf_output_end(&handle); } -static int perf_counter_comm_match(struct perf_counter *counter, - struct perf_comm_event *comm_event) +static int perf_counter_comm_match(struct perf_counter *counter) { - if (counter->hw_event.comm && - comm_event->event.header.type == PERF_EVENT_COMM) + if (counter->attr.comm) return 1; return 0; @@ -2462,7 +2697,7 @@ static void perf_counter_comm_ctx(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_comm_match(counter, comm_event)) + if (perf_counter_comm_match(counter)) perf_counter_comm_output(counter, comm_event); } rcu_read_unlock(); @@ -2501,15 +2736,13 @@ void perf_counter_comm(struct task_struct *task) { struct perf_comm_event comm_event; - if (!atomic_read(&nr_comm_tracking)) + if (!atomic_read(&nr_comm_counters)) return; comm_event = (struct perf_comm_event){ .task = task, .event = { .header = { .type = PERF_EVENT_COMM, }, - .pid = task->group_leader->pid, - .tid = task->pid, }, }; @@ -2521,9 +2754,10 @@ void perf_counter_comm(struct task_struct *task) */ struct perf_mmap_event { - struct file *file; - char *file_name; - int file_size; + struct vm_area_struct *vma; + + const char *file_name; + int file_size; struct { struct perf_event_header header; @@ -2546,6 +2780,9 @@ static void perf_counter_mmap_output(struct perf_counter *counter, if (ret) return; + mmap_event->event.pid = perf_counter_pid(counter, current); + mmap_event->event.tid = perf_counter_tid(counter, current); + perf_output_put(&handle, mmap_event->event); perf_output_copy(&handle, mmap_event->file_name, mmap_event->file_size); @@ -2555,12 +2792,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter, static int perf_counter_mmap_match(struct perf_counter *counter, struct perf_mmap_event *mmap_event) { - if (counter->hw_event.mmap && - mmap_event->event.header.type == PERF_EVENT_MMAP) - return 1; - - if (counter->hw_event.munmap && - mmap_event->event.header.type == PERF_EVENT_MUNMAP) + if (counter->attr.mmap) return 1; return 0; @@ -2586,11 +2818,12 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) { struct perf_cpu_context *cpuctx; struct perf_counter_context 
*ctx; - struct file *file = mmap_event->file; + struct vm_area_struct *vma = mmap_event->vma; + struct file *file = vma->vm_file; unsigned int size; char tmp[16]; char *buf = NULL; - char *name; + const char *name; if (file) { buf = kzalloc(PATH_MAX, GFP_KERNEL); @@ -2604,6 +2837,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) goto got_name; } } else { + name = arch_vma_name(mmap_event->vma); + if (name) + goto got_name; + + if (!vma->vm_mm) { + name = strncpy(tmp, "[vdso]", sizeof(tmp)); + goto got_name; + } + name = strncpy(tmp, "//anon", sizeof(tmp)); goto got_name; } @@ -2633,46 +2875,20 @@ got_name: kfree(buf); } -void perf_counter_mmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) +void __perf_counter_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; - if (!atomic_read(&nr_mmap_tracking)) + if (!atomic_read(&nr_mmap_counters)) return; mmap_event = (struct perf_mmap_event){ - .file = file, + .vma = vma, .event = { .header = { .type = PERF_EVENT_MMAP, }, - .pid = current->group_leader->pid, - .tid = current->pid, - .start = addr, - .len = len, - .pgoff = pgoff, - }, - }; - - perf_counter_mmap_event(&mmap_event); -} - -void perf_counter_munmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) -{ - struct perf_mmap_event mmap_event; - - if (!atomic_read(&nr_munmap_tracking)) - return; - - mmap_event = (struct perf_mmap_event){ - .file = file, - .event = { - .header = { .type = PERF_EVENT_MUNMAP, }, - .pid = current->group_leader->pid, - .tid = current->pid, - .start = addr, - .len = len, - .pgoff = pgoff, + .start = vma->vm_start, + .len = vma->vm_end - vma->vm_start, + .pgoff = vma->vm_pgoff, }, }; @@ -2680,37 +2896,45 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, } /* - * Log irq_period changes so that analyzing tools can re-normalize the + * Log sample_period changes so that analyzing tools can re-normalize the * event flow. */ +struct freq_event { + struct perf_event_header header; + u64 time; + u64 id; + u64 period; +}; + static void perf_log_period(struct perf_counter *counter, u64 period) { struct perf_output_handle handle; + struct freq_event event; int ret; - struct { - struct perf_event_header header; - u64 time; - u64 period; - } freq_event = { + if (counter->hw.sample_period == period) + return; + + if (counter->attr.sample_type & PERF_SAMPLE_PERIOD) + return; + + event = (struct freq_event) { .header = { .type = PERF_EVENT_PERIOD, .misc = 0, - .size = sizeof(freq_event), + .size = sizeof(event), }, .time = sched_clock(), + .id = counter->id, .period = period, }; - if (counter->hw.irq_period == period) - return; - - ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0); + ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0); if (ret) return; - perf_output_put(&handle, freq_event); + perf_output_put(&handle, event); perf_output_end(&handle); } @@ -2726,13 +2950,15 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) struct { struct perf_event_header header; u64 time; + u64 id; } throttle_event = { .header = { .type = PERF_EVENT_THROTTLE + 1, .misc = 0, .size = sizeof(throttle_event), }, - .time = sched_clock(), + .time = sched_clock(), + .id = counter->id, }; ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); @@ -2747,24 +2973,45 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) * Generic counter overflow handling. 
*/ -int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr) +int perf_counter_overflow(struct perf_counter *counter, int nmi, + struct perf_sample_data *data) { int events = atomic_read(&counter->event_limit); int throttle = counter->pmu->unthrottle != NULL; + struct hw_perf_counter *hwc = &counter->hw; int ret = 0; if (!throttle) { - counter->hw.interrupts++; - } else if (counter->hw.interrupts != MAX_INTERRUPTS) { - counter->hw.interrupts++; - if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) { - counter->hw.interrupts = MAX_INTERRUPTS; - perf_log_throttle(counter, 0); + hwc->interrupts++; + } else { + if (hwc->interrupts != MAX_INTERRUPTS) { + hwc->interrupts++; + if (HZ * hwc->interrupts > + (u64)sysctl_perf_counter_sample_rate) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(counter, 0); + ret = 1; + } + } else { + /* + * Keep re-disabling counters even though on the previous + * pass we disabled it - just in case we raced with a + * sched-in and the counter got enabled again: + */ ret = 1; } } + if (counter->attr.freq) { + u64 now = sched_clock(); + s64 delta = now - hwc->freq_stamp; + + hwc->freq_stamp = now; + + if (delta > 0 && delta < TICK_NSEC) + perf_adjust_period(counter, NSEC_PER_SEC / (int)delta); + } + /* * XXX event_limit might not quite work as expected on inherited * counters @@ -2782,7 +3029,7 @@ int perf_counter_overflow(struct perf_counter *counter, perf_counter_disable(counter); } - perf_counter_output(counter, nmi, regs, addr); + perf_counter_output(counter, nmi, data); return ret; } @@ -2812,16 +3059,18 @@ static void perf_swcounter_set_period(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->irq_period; + s64 period = hwc->sample_period; if (unlikely(left <= -period)) { left = period; atomic64_set(&hwc->period_left, left); + hwc->last_period = period; } if (unlikely(left <= 0)) { left += period; atomic64_add(period, &hwc->period_left); + hwc->last_period = period; } atomic64_set(&hwc->prev_count, -left); @@ -2831,28 +3080,29 @@ static void perf_swcounter_set_period(struct perf_counter *counter) static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; struct perf_counter *counter; - struct pt_regs *regs; u64 period; counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); counter->pmu->read(counter); - regs = get_irq_regs(); + data.addr = 0; + data.regs = get_irq_regs(); /* * In case we exclude kernel IPs or are somehow not in interrupt * context, provide the next best thing, the user IP. 
*/ - if ((counter->hw_event.exclude_kernel || !regs) && - !counter->hw_event.exclude_user) - regs = task_pt_regs(current); + if ((counter->attr.exclude_kernel || !data.regs) && + !counter->attr.exclude_user) + data.regs = task_pt_regs(current); - if (regs) { - if (perf_counter_overflow(counter, 0, regs, 0)) + if (data.regs) { + if (perf_counter_overflow(counter, 0, &data)) ret = HRTIMER_NORESTART; } - period = max_t(u64, 10000, counter->hw.irq_period); + period = max_t(u64, 10000, counter->hw.sample_period); hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; @@ -2861,9 +3111,15 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) static void perf_swcounter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { + struct perf_sample_data data = { + .regs = regs, + .addr = addr, + .period = counter->hw.last_period, + }; + perf_swcounter_update(counter); perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, regs, addr)) + if (perf_counter_overflow(counter, nmi, &data)) /* soft-disable the counter */ ; @@ -2908,24 +3164,24 @@ static int perf_swcounter_is_counting(struct perf_counter *counter) } static int perf_swcounter_match(struct perf_counter *counter, - enum perf_event_types type, + enum perf_type_id type, u32 event, struct pt_regs *regs) { - u64 event_config; - - event_config = ((u64) type << PERF_COUNTER_TYPE_SHIFT) | event; - if (!perf_swcounter_is_counting(counter)) return 0; - if (counter->hw_event.config != event_config) + if (counter->attr.type != type) return 0; - - if (counter->hw_event.exclude_user && user_mode(regs)) + if (counter->attr.config != event) return 0; - if (counter->hw_event.exclude_kernel && !user_mode(regs)) - return 0; + if (regs) { + if (counter->attr.exclude_user && user_mode(regs)) + return 0; + + if (counter->attr.exclude_kernel && !user_mode(regs)) + return 0; + } return 1; } @@ -2935,12 +3191,12 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, { int neg = atomic64_add_negative(nr, &counter->hw.count); - if (counter->hw.irq_period && !neg) + if (counter->hw.sample_period && !neg && regs) perf_swcounter_overflow(counter, nmi, regs, addr); } static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum perf_event_types type, u32 event, + enum perf_type_id type, u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { @@ -2971,7 +3227,7 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) return &cpuctx->recursion[0]; } -static void __perf_swcounter_event(enum perf_event_types type, u32 event, +static void __perf_swcounter_event(enum perf_type_id type, u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { @@ -3056,8 +3312,8 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) atomic64_set(&hwc->prev_count, cpu_clock(cpu)); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->irq_period) { - u64 period = max_t(u64, 10000, hwc->irq_period); + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); @@ -3068,7 +3324,7 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) static void cpu_clock_perf_counter_disable(struct perf_counter *counter) { - if (counter->hw.irq_period) + if (counter->hw.sample_period) hrtimer_cancel(&counter->hw.hrtimer); cpu_clock_perf_counter_update(counter); } 
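
Editorial aside: the frequency-driven period adjustment introduced earlier in this patch (perf_adjust_period(), called from perf_ctx_adjust_freq() and from the overflow path) reduces to a small piece of integer arithmetic: rescale the current sample period by observed-event-rate over requested sample frequency, then damp the correction with a divide-by-8 low-pass filter. The stand-alone C sketch below models only that arithmetic; the function name, parameters and the example numbers are illustrative, not part of the kernel patch.

#include <stdint.h>
#include <stdio.h>

/*
 * Stand-alone model of the perf_adjust_period() arithmetic from this patch:
 * scale the current sample period by the observed interrupt rate over the
 * requested sample frequency, then apply the (delta + 7) / 8 low-pass filter.
 */
static uint64_t adjust_period(uint64_t sample_period,
			      uint64_t interrupts_per_sec,
			      uint64_t sample_freq)
{
	uint64_t target;
	int64_t delta;

	/* hardware events per second / wanted samples per second */
	target = (interrupts_per_sec * sample_period) / sample_freq;

	delta = (int64_t)(target - sample_period);
	delta = (delta + 7) / 8;		/* low pass filter */

	sample_period += delta;
	if (!sample_period)
		sample_period = 1;		/* never allow a zero period */

	return sample_period;
}

int main(void)
{
	/* hypothetical case: 4000 Hz requested, counter firing at ~6000 Hz */
	uint64_t period = 100000;
	int tick;

	for (tick = 0; tick < 5; tick++) {
		period = adjust_period(period, 6000, 4000);
		printf("tick %d: sample_period -> %llu\n",
		       tick, (unsigned long long)period);
	}
	return 0;
}

With a fixed interrupt rate the period converges on the ideal value over several ticks rather than jumping straight to it, which is the point of the filter: it smooths out bursty interrupt counts instead of letting a single noisy tick swing the sample period.
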
@@ -3108,8 +3364,8 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) atomic64_set(&hwc->prev_count, now); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->irq_period) { - u64 period = max_t(u64, 10000, hwc->irq_period); + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); @@ -3120,7 +3376,7 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) static void task_clock_perf_counter_disable(struct perf_counter *counter) { - if (counter->hw.irq_period) + if (counter->hw.sample_period) hrtimer_cancel(&counter->hw.hrtimer); task_clock_perf_counter_update(counter, counter->ctx->time); @@ -3151,55 +3407,24 @@ static const struct pmu perf_ops_task_clock = { /* * Software counter: cpu migrations */ - -static inline u64 get_cpu_migrations(struct perf_counter *counter) -{ - struct task_struct *curr = counter->ctx->task; - - if (curr) - return curr->se.nr_migrations; - return cpu_nr_migrations(smp_processor_id()); -} - -static void cpu_migrations_perf_counter_update(struct perf_counter *counter) -{ - u64 prev, now; - s64 delta; - - prev = atomic64_read(&counter->hw.prev_count); - now = get_cpu_migrations(counter); - - atomic64_set(&counter->hw.prev_count, now); - - delta = now - prev; - - atomic64_add(delta, &counter->count); -} - -static void cpu_migrations_perf_counter_read(struct perf_counter *counter) +void perf_counter_task_migration(struct task_struct *task, int cpu) { - cpu_migrations_perf_counter_update(counter); -} + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx; -static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) -{ - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, - get_cpu_migrations(counter)); - return 0; -} + perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE, + PERF_COUNT_SW_CPU_MIGRATIONS, + 1, 1, NULL, 0); -static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) -{ - cpu_migrations_perf_counter_update(counter); + ctx = perf_pin_task_context(task); + if (ctx) { + perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE, + PERF_COUNT_SW_CPU_MIGRATIONS, + 1, 1, NULL, 0); + perf_unpin_context(ctx); + } } -static const struct pmu perf_ops_cpu_migrations = { - .enable = cpu_migrations_perf_counter_enable, - .disable = cpu_migrations_perf_counter_disable, - .read = cpu_migrations_perf_counter_read, -}; - #ifdef CONFIG_EVENT_PROFILE void perf_tpcounter_event(int event_id) { @@ -3217,12 +3442,12 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - ftrace_profile_disable(perf_event_id(&counter->hw_event)); + ftrace_profile_disable(perf_event_id(&counter->attr)); } static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { - int event_id = perf_event_id(&counter->hw_event); + int event_id = perf_event_id(&counter->attr); int ret; ret = ftrace_profile_enable(event_id); @@ -3230,7 +3455,6 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; - counter->hw.irq_period = counter->hw_event.irq_period; return &perf_ops_generic; } @@ -3252,12 +3476,12 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are 
never hypervisor * events. */ - switch (perf_event_id(&counter->hw_event)) { - case PERF_COUNT_CPU_CLOCK: + switch (counter->attr.config) { + case PERF_COUNT_SW_CPU_CLOCK: pmu = &perf_ops_cpu_clock; break; - case PERF_COUNT_TASK_CLOCK: + case PERF_COUNT_SW_TASK_CLOCK: /* * If the user instantiates this as a per-cpu counter, * use the cpu_clock counter instead. @@ -3268,16 +3492,13 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) pmu = &perf_ops_cpu_clock; break; - case PERF_COUNT_PAGE_FAULTS: - case PERF_COUNT_PAGE_FAULTS_MIN: - case PERF_COUNT_PAGE_FAULTS_MAJ: - case PERF_COUNT_CONTEXT_SWITCHES: + case PERF_COUNT_SW_PAGE_FAULTS: + case PERF_COUNT_SW_PAGE_FAULTS_MIN: + case PERF_COUNT_SW_PAGE_FAULTS_MAJ: + case PERF_COUNT_SW_CONTEXT_SWITCHES: + case PERF_COUNT_SW_CPU_MIGRATIONS: pmu = &perf_ops_generic; break; - case PERF_COUNT_CPU_MIGRATIONS: - if (!counter->hw_event.exclude_kernel) - pmu = &perf_ops_cpu_migrations; - break; } return pmu; @@ -3287,7 +3508,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_hw_event *hw_event, +perf_counter_alloc(struct perf_counter_attr *attr, int cpu, struct perf_counter_context *ctx, struct perf_counter *group_leader, @@ -3319,38 +3540,44 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, mutex_init(&counter->mmap_mutex); - counter->cpu = cpu; - counter->hw_event = *hw_event; - counter->group_leader = group_leader; - counter->pmu = NULL; - counter->ctx = ctx; - counter->oncpu = -1; + counter->cpu = cpu; + counter->attr = *attr; + counter->group_leader = group_leader; + counter->pmu = NULL; + counter->ctx = ctx; + counter->oncpu = -1; - counter->state = PERF_COUNTER_STATE_INACTIVE; - if (hw_event->disabled) + counter->ns = get_pid_ns(current->nsproxy->pid_ns); + counter->id = atomic64_inc_return(&perf_counter_id); + + counter->state = PERF_COUNTER_STATE_INACTIVE; + + if (attr->disabled) counter->state = PERF_COUNTER_STATE_OFF; pmu = NULL; hwc = &counter->hw; - if (hw_event->freq && hw_event->irq_freq) - hwc->irq_period = div64_u64(TICK_NSEC, hw_event->irq_freq); - else - hwc->irq_period = hw_event->irq_period; + hwc->sample_period = attr->sample_period; + if (attr->freq && attr->sample_freq) + hwc->sample_period = 1; + + atomic64_set(&hwc->period_left, hwc->sample_period); /* - * we currently do not support PERF_RECORD_GROUP on inherited counters + * we currently do not support PERF_SAMPLE_GROUP on inherited counters */ - if (hw_event->inherit && (hw_event->record_type & PERF_RECORD_GROUP)) + if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) goto done; - if (perf_event_raw(hw_event)) { + if (attr->type == PERF_TYPE_RAW) { pmu = hw_perf_counter_init(counter); goto done; } - switch (perf_event_type(hw_event)) { + switch (attr->type) { case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: pmu = hw_perf_counter_init(counter); break; @@ -3370,6 +3597,8 @@ done: err = PTR_ERR(pmu); if (err) { + if (counter->ns) + put_pid_ns(counter->ns); kfree(counter); return ERR_PTR(err); } @@ -3377,12 +3606,10 @@ done: counter->pmu = pmu; atomic_inc(&nr_counters); - if (counter->hw_event.mmap) - atomic_inc(&nr_mmap_tracking); - if (counter->hw_event.munmap) - atomic_inc(&nr_munmap_tracking); - if (counter->hw_event.comm) - atomic_inc(&nr_comm_tracking); + if (counter->attr.mmap) + atomic_inc(&nr_mmap_counters); + if (counter->attr.comm) + atomic_inc(&nr_comm_counters); return 
counter; } @@ -3390,17 +3617,17 @@ done: /** * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * - * @hw_event_uptr: event type attributes for monitoring/sampling + * @attr_uptr: event type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader counter fd */ SYSCALL_DEFINE5(perf_counter_open, - const struct perf_counter_hw_event __user *, hw_event_uptr, + const struct perf_counter_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_counter *counter, *group_leader; - struct perf_counter_hw_event hw_event; + struct perf_counter_attr attr; struct perf_counter_context *ctx; struct file *counter_file = NULL; struct file *group_file = NULL; @@ -3412,9 +3639,19 @@ SYSCALL_DEFINE5(perf_counter_open, if (flags) return -EINVAL; - if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) + if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0) return -EFAULT; + if (!attr.exclude_kernel) { + if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + if (attr.freq) { + if (attr.sample_freq > sysctl_perf_counter_sample_rate) + return -EINVAL; + } + /* * Get the target context (task or percpu): */ @@ -3450,11 +3687,11 @@ SYSCALL_DEFINE5(perf_counter_open, /* * Only a group leader can be exclusive or pinned */ - if (hw_event.exclusive || hw_event.pinned) + if (attr.exclusive || attr.pinned) goto err_put_context; } - counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader, + counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, GFP_KERNEL); ret = PTR_ERR(counter); if (IS_ERR(counter)) @@ -3519,7 +3756,7 @@ inherit_counter(struct perf_counter *parent_counter, if (parent_counter->parent) parent_counter = parent_counter->parent; - child_counter = perf_counter_alloc(&parent_counter->hw_event, + child_counter = perf_counter_alloc(&parent_counter->attr, parent_counter->cpu, child_ctx, group_leader, GFP_KERNEL); if (IS_ERR(child_counter)) @@ -3528,7 +3765,7 @@ inherit_counter(struct perf_counter *parent_counter, /* * Make the child state follow the state of the parent counter, - * not its hw_event.disabled bit. We hold the parent's mutex, + * not its attr.disabled bit. We hold the parent's mutex, * so we won't race with perf_counter_{en, dis}able_family. */ if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) @@ -3536,6 +3773,9 @@ inherit_counter(struct perf_counter *parent_counter, else child_counter->state = PERF_COUNTER_STATE_OFF; + if (parent_counter->attr.freq) + child_counter->hw.sample_period = parent_counter->hw.sample_period; + /* * Link it up in the child's context: */ @@ -3545,7 +3785,7 @@ inherit_counter(struct perf_counter *parent_counter, /* * inherit into child's child as well: */ - child_counter->hw_event.inherit = 1; + child_counter->attr.inherit = 1; /* * Get a reference to the parent filp - we will fput it @@ -3682,7 +3922,18 @@ void perf_counter_exit_task(struct task_struct *child) spin_unlock(&child_ctx->lock); local_irq_restore(flags); - mutex_lock(&child_ctx->mutex); + /* + * We can recurse on the same lock type through: + * + * __perf_counter_exit_task() + * sync_child_counter() + * fput(parent_counter->filp) + * perf_release() + * mutex_lock(&ctx->mutex) + * + * But since its the parent context it won't be the same instance. 
+ */ + mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, @@ -3801,7 +4052,7 @@ int perf_counter_init_task(struct task_struct *child) if (counter != counter->group_leader) continue; - if (!counter->hw_event.inherit) { + if (!counter->attr.inherit) { inherited_all = 0; continue; } @@ -3902,8 +4153,12 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) return NOTIFY_OK; } +/* + * This has to have a higher priority than migration_notifier in sched.c. + */ static struct notifier_block __cpuinitdata perf_cpu_nb = { .notifier_call = perf_cpu_notify, + .priority = 20, }; void __init perf_counter_init(void)
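
Closing editorial note: the new sysctl_perf_counter_paranoid knob and the sample-rate cap added by this patch gate counter creation with a few simple checks, spread across sys_perf_counter_open() (unprivileged kernel profiling, over-limit sample_freq) and find_get_context() (unprivileged CPU-wide counters). The sketch below gathers just those checks into one user-space C model; the struct, helper names and the test harness are illustrative, not the kernel's code.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Stand-alone model of the permission/limit checks this patch adds around
 * counter creation.  Field names mirror the patch; everything else is an
 * illustrative sketch.
 */
static int sysctl_perf_counter_paranoid;		/* 0, 1 or 2, as documented above */
static int sysctl_perf_counter_sample_rate = 100000;

static bool perf_paranoid_cpu(void)    { return sysctl_perf_counter_paranoid > 0; }
static bool perf_paranoid_kernel(void) { return sysctl_perf_counter_paranoid > 1; }

struct attr_model {
	bool exclude_kernel;		/* do not count kernel-mode events */
	bool freq;			/* sample_freq is a frequency, not a period */
	unsigned long long sample_freq;
};

static int check_open(const struct attr_model *attr, int cpu, bool cap_sys_admin)
{
	/* kernel profiling may be restricted to privileged users (level 2) */
	if (!attr->exclude_kernel && perf_paranoid_kernel() && !cap_sys_admin)
		return -EACCES;

	/* a requested frequency may not exceed the global sample-rate cap */
	if (attr->freq && attr->sample_freq > sysctl_perf_counter_sample_rate)
		return -EINVAL;

	/* CPU-wide counters (cpu != -1) may be restricted to root (level 1) */
	if (cpu != -1 && perf_paranoid_cpu() && !cap_sys_admin)
		return -EACCES;

	return 0;
}

int main(void)
{
	struct attr_model attr = { .exclude_kernel = false,
				   .freq = true, .sample_freq = 4000 };

	sysctl_perf_counter_paranoid = 2;	/* most restrictive setting */
	printf("unprivileged, kernel profiling: %d\n", check_open(&attr, -1, false));
	printf("privileged,   kernel profiling: %d\n", check_open(&attr, -1, true));
	return 0;
}

At paranoia level 2 the unprivileged call is rejected with -EACCES while the privileged one succeeds, matching the behaviour the patch describes for the three sysctl_perf_counter_paranoid levels.
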