perf: Disallow mis-matched inherited group reads
authorPeter Zijlstra <peterz@infradead.org>
Wed, 18 Oct 2023 11:56:54 +0000 (13:56 +0200)
committerPeter Zijlstra <peterz@infradead.org>
Thu, 19 Oct 2023 08:09:42 +0000 (10:09 +0200)
Because group consistency is non-atomic between parent (filedesc) and children
(inherited) events, it is possible for PERF_FORMAT_GROUP read() to try and sum
non-matching counter groups -- with non-sensical results.

Add group_generation to distinguish the case where a parent group removes and
adds an event and thus has the same number, but a different configuration of
events as inherited groups.

This became a problem when commit fa8c269353d5 ("perf/core: Invert
perf_read_group() loops") flipped the order of child_list and sibling_list.
Previously it would iterate the group (sibling_list) first, and for each
sibling traverse the child_list. In this order, only the group composition of
the parent is relevant. By flipping the order the group composition of the
child (inherited) events becomes an issue and the mis-match in group
composition becomes evident.

That said; even prior to this commit, while reading of a group that is not
equally inherited was not broken, it still made no sense.

(Ab)use ECHILD as error return to indicate issues with child process group
composition.

Fixes: fa8c269353d5 ("perf/core: Invert perf_read_group() loops")
Reported-by: Budimir Markovic <markovicbudimir@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231018115654.GK33217@noisy.programming.kicks-ass.net
include/linux/perf_event.h
kernel/events/core.c

index e85cd1c0eaf355ebb85c0b7c48b803283d91544e..7b5406e3288d98cdf42d1d5f10afef0928b73a4f 100644 (file)
@@ -704,6 +704,7 @@ struct perf_event {
        /* The cumulative AND of all event_caps for events in this group. */
        int                             group_caps;
 
+       unsigned int                    group_generation;
        struct perf_event               *group_leader;
        /*
         * event->pmu will always point to pmu in which this event belongs.
index 4c72a41f11afbfd8cd893f076176e17602cb78ab..d0663b9324e7a76b0ece07d74b0b4b0a7390e82b 100644 (file)
@@ -1954,6 +1954,7 @@ static void perf_group_attach(struct perf_event *event)
 
        list_add_tail(&event->sibling_list, &group_leader->sibling_list);
        group_leader->nr_siblings++;
+       group_leader->group_generation++;
 
        perf_event__header_size(group_leader);
 
@@ -2144,6 +2145,7 @@ static void perf_group_detach(struct perf_event *event)
        if (leader != event) {
                list_del_init(&event->sibling_list);
                event->group_leader->nr_siblings--;
+               event->group_leader->group_generation++;
                goto out;
        }
 
@@ -5440,7 +5442,7 @@ static int __perf_read_group_add(struct perf_event *leader,
                                        u64 read_format, u64 *values)
 {
        struct perf_event_context *ctx = leader->ctx;
-       struct perf_event *sub;
+       struct perf_event *sub, *parent;
        unsigned long flags;
        int n = 1; /* skip @nr */
        int ret;
@@ -5450,6 +5452,33 @@ static int __perf_read_group_add(struct perf_event *leader,
                return ret;
 
        raw_spin_lock_irqsave(&ctx->lock, flags);
+       /*
+        * Verify the grouping between the parent and child (inherited)
+        * events is still in tact.
+        *
+        * Specifically:
+        *  - leader->ctx->lock pins leader->sibling_list
+        *  - parent->child_mutex pins parent->child_list
+        *  - parent->ctx->mutex pins parent->sibling_list
+        *
+        * Because parent->ctx != leader->ctx (and child_list nests inside
+        * ctx->mutex), group destruction is not atomic between children, also
+        * see perf_event_release_kernel(). Additionally, parent can grow the
+        * group.
+        *
+        * Therefore it is possible to have parent and child groups in a
+        * different configuration and summing over such a beast makes no sense
+        * what so ever.
+        *
+        * Reject this.
+        */
+       parent = leader->parent;
+       if (parent &&
+           (parent->group_generation != leader->group_generation ||
+            parent->nr_siblings != leader->nr_siblings)) {
+               ret = -ECHILD;
+               goto unlock;
+       }
 
        /*
         * Since we co-schedule groups, {enabled,running} times of siblings
@@ -5483,8 +5512,9 @@ static int __perf_read_group_add(struct perf_event *leader,
                        values[n++] = atomic64_read(&sub->lost_samples);
        }
 
+unlock:
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
-       return 0;
+       return ret;
 }
 
 static int perf_read_group(struct perf_event *event,
@@ -5503,10 +5533,6 @@ static int perf_read_group(struct perf_event *event,
 
        values[0] = 1 + leader->nr_siblings;
 
-       /*
-        * By locking the child_mutex of the leader we effectively
-        * lock the child list of all siblings.. XXX explain how.
-        */
        mutex_lock(&leader->child_mutex);
 
        ret = __perf_read_group_add(leader, read_format, values);
@@ -13346,6 +13372,7 @@ static int inherit_group(struct perf_event *parent_event,
                    !perf_get_aux_event(child_ctr, leader))
                        return -EINVAL;
        }
+       leader->group_generation = parent_event->group_generation;
        return 0;
 }