bpf: Change bpf_mem_cache draining process.
author Alexei Starovoitov <ast@kernel.org>
Thu, 6 Jul 2023 03:34:40 +0000 (20:34 -0700)
committer Daniel Borkmann <daniel@iogearbox.net>
Wed, 12 Jul 2023 21:45:22 +0000 (23:45 +0200)
The next patch will introduce cross-cpu llist access, and the existing
irq_work_sync() + drain_mem_cache() + rcu_barrier_tasks_trace() mechanism will
not be enough, since irq_work_sync() + drain_mem_cache() on cpu A won't
guarantee that the llists on cpu A are empty: free_bulk() on cpu B might add
objects back to an llist of cpu A. Add a 'bool draining' flag.
The modified sequence looks like:
for_each_cpu:
  WRITE_ONCE(c->draining, true); // do_call_rcu_ttrace() won't be doing call_rcu() any more
  irq_work_sync(); // wait for irq_work callback (free_bulk) to finish
  drain_mem_cache(); // free all objects
rcu_barrier_tasks_trace(); // wait for RCU callbacks to execute

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/bpf/20230706033447.54696-8-alexei.starovoitov@gmail.com
kernel/bpf/memalloc.c

index 0ee566a7719a90520825e8683377e4d9241d732f..2615f296f052d9329a0af737f05bf6a77fb6b0fa 100644 (file)
@@ -98,6 +98,7 @@ struct bpf_mem_cache {
        int free_cnt;
        int low_watermark, high_watermark, batch;
        int percpu_size;
+       bool draining;
 
        /* list of objects to be freed after RCU tasks trace GP */
        struct llist_head free_by_rcu_ttrace;
@@ -301,6 +302,12 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
                 * from __free_rcu() and from drain_mem_cache().
                 */
                __llist_add(llnode, &c->waiting_for_gp_ttrace);
+
+       if (unlikely(READ_ONCE(c->draining))) {
+               __free_rcu(&c->rcu_ttrace);
+               return;
+       }
+
        /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
         * If RCU Tasks Trace grace period implies RCU grace period, free
         * these elements directly, else use call_rcu() to wait for normal
@@ -544,15 +551,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
                rcu_in_progress = 0;
                for_each_possible_cpu(cpu) {
                        c = per_cpu_ptr(ma->cache, cpu);
-                       /*
-                        * refill_work may be unfinished for PREEMPT_RT kernel
-                        * in which irq work is invoked in a per-CPU RT thread.
-                        * It is also possible for kernel with
-                        * arch_irq_work_has_interrupt() being false and irq
-                        * work is invoked in timer interrupt. So waiting for
-                        * the completion of irq work to ease the handling of
-                        * concurrency.
-                        */
+                       WRITE_ONCE(c->draining, true);
                        irq_work_sync(&c->refill_work);
                        drain_mem_cache(c);
                        rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
@@ -568,6 +567,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
                        cc = per_cpu_ptr(ma->caches, cpu);
                        for (i = 0; i < NUM_CACHES; i++) {
                                c = &cc->cache[i];
+                               WRITE_ONCE(c->draining, true);
                                irq_work_sync(&c->refill_work);
                                drain_mem_cache(c);
                                rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);