bpf: Check map->usercnt after timer->timer is assigned
authorHou Tao <houtao1@huawei.com>
Mon, 30 Oct 2023 06:36:16 +0000 (14:36 +0800)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 20 Nov 2023 10:59:34 +0000 (11:59 +0100)
[ Upstream commit fd381ce60a2d79cc967506208085336d3d268ae0 ]

When there are concurrent uref release and bpf timer init operations,
the following sequence diagram is possible. It will break the guarantee
provided by bpf_timer: bpf_timer will still be alive after userspace
application releases or unpins the map. It also will lead to kmemleak
for old kernel version which doesn't release bpf_timer when map is
released.

bpf program X:

bpf_timer_init()
  lock timer->lock
    read timer->timer as NULL
    read map->usercnt != 0

                process Y:

                close(map_fd)
                  // put last uref
                  bpf_map_put_uref()
                    atomic_dec_and_test(map->usercnt)
                      array_map_free_timers()
                        bpf_timer_cancel_and_free()
                          // just return
                          read timer->timer is NULL

    t = bpf_map_kmalloc_node()
    timer->timer = t
  unlock timer->lock

Fix the problem by checking map->usercnt after timer->timer is assigned,
so when there are concurrent uref release and bpf timer init, either
bpf_timer_cancel_and_free() from uref release reads a no-NULL timer
or the newly-added atomic64_read() returns a zero usercnt.

Because atomic_dec_and_test(map->usercnt) and READ_ONCE(timer->timer)
in bpf_timer_cancel_and_free() are not protected by a lock, so add
a memory barrier to guarantee the order between map->usercnt and
timer->timer. Also use WRITE_ONCE(timer->timer, x) to match the lockless
read of timer->timer in bpf_timer_cancel_and_free().

Reported-by: Hsin-Wei Hung <hsinweih@uci.edu>
Closes: https://lore.kernel.org/bpf/CABcoxUaT2k9hWsS1tNgXyoU3E-=PuOgMn737qK984fbFmfYixQ@mail.gmail.com
Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.")
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231030063616.1653024-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
kernel/bpf/helpers.c

index 68f54e1..607be04 100644 (file)
@@ -1176,13 +1176,6 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
                ret = -EBUSY;
                goto out;
        }
-       if (!atomic64_read(&map->usercnt)) {
-               /* maps with timers must be either held by user space
-                * or pinned in bpffs.
-                */
-               ret = -EPERM;
-               goto out;
-       }
        /* allocate hrtimer via map_kmalloc to use memcg accounting */
        t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
        if (!t) {
@@ -1195,7 +1188,21 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
        rcu_assign_pointer(t->callback_fn, NULL);
        hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
        t->timer.function = bpf_timer_cb;
-       timer->timer = t;
+       WRITE_ONCE(timer->timer, t);
+       /* Guarantee the order between timer->timer and map->usercnt. So
+        * when there are concurrent uref release and bpf timer init, either
+        * bpf_timer_cancel_and_free() called by uref release reads a no-NULL
+        * timer or atomic64_read() below returns a zero usercnt.
+        */
+       smp_mb();
+       if (!atomic64_read(&map->usercnt)) {
+               /* maps with timers must be either held by user space
+                * or pinned in bpffs.
+                */
+               WRITE_ONCE(timer->timer, NULL);
+               kfree(t);
+               ret = -EPERM;
+       }
 out:
        __bpf_spin_unlock_irqrestore(&timer->lock);
        return ret;
@@ -1370,7 +1377,7 @@ void bpf_timer_cancel_and_free(void *val)
        /* The subsequent bpf_timer_start/cancel() helpers won't be able to use
         * this timer, since it won't be initialized.
         */
-       timer->timer = NULL;
+       WRITE_ONCE(timer->timer, NULL);
 out:
        __bpf_spin_unlock_irqrestore(&timer->lock);
        if (!t)