bpf: Optimize rcu_barrier usage between hash map and bpf_mem_alloc.

author Alexei Starovoitov <ast@kernel.org>

Fri, 2 Sep 2022 21:10:58 +0000 (14:10 -0700)

committer Daniel Borkmann <daniel@iogearbox.net>

Mon, 5 Sep 2022 13:33:07 +0000 (15:33 +0200)
author Alexei Starovoitov <ast@kernel.org>
Fri, 2 Sep 2022 21:10:58 +0000 (14:10 -0700)
committer Daniel Borkmann <daniel@iogearbox.net>
Mon, 5 Sep 2022 13:33:07 +0000 (15:33 +0200)
diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h

index 653ed15..3e164b8 100644 (file)
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -3,6 +3,7 @@
  #ifndef _BPF_MEM_ALLOC_H
  #define _BPF_MEM_ALLOC_H
  #include <linux/compiler_types.h>
+#include <linux/workqueue.h>
  
  struct bpf_mem_cache;
  struct bpf_mem_caches;
@@ -10,6 +11,7 @@ struct bpf_mem_caches;
  struct bpf_mem_alloc {
         struct bpf_mem_caches __percpu *caches;
         struct bpf_mem_cache __percpu *cache;
+       struct work_struct work;
  };
  
  int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c

index a77b9c4..0fe3f13 100644 (file)
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1546,10 +1546,10 @@ static void htab_map_free(struct bpf_map *map)
          * There is no need to synchronize_rcu() here to protect map elements.
          */
  
-       /* some of free_htab_elem() callbacks for elements of this map may
-        * not have executed. Wait for them.
+       /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
+        * underneath and is reponsible for waiting for callbacks to finish
+        * during bpf_mem_alloc_destroy().
          */
-       rcu_barrier();
         if (!htab_is_prealloc(htab)) {
                 delete_all_elements(htab);
         } else {
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c

index 38fbd15..5cc952d 100644 (file)
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -414,10 +414,9 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
  {
         struct llist_node *llnode, *t;
  
-       /* The caller has done rcu_barrier() and no progs are using this
-        * bpf_mem_cache, but htab_map_free() called bpf_mem_cache_free() for
-        * all remaining elements and they can be in free_by_rcu or in
-        * waiting_for_gp lists, so drain those lists now.
+       /* No progs are using this bpf_mem_cache, but htab_map_free() called
+        * bpf_mem_cache_free() for all remaining elements and they can be in
+        * free_by_rcu or in waiting_for_gp lists, so drain those lists now.
          */
         llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
                 free_one(c, llnode);
@@ -429,42 +428,91 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
                 free_one(c, llnode);
  }
  
+static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
+{
+       free_percpu(ma->cache);
+       free_percpu(ma->caches);
+       ma->cache = NULL;
+       ma->caches = NULL;
+}
+
+static void free_mem_alloc(struct bpf_mem_alloc *ma)
+{
+       /* waiting_for_gp lists was drained, but __free_rcu might
+        * still execute. Wait for it now before we freeing percpu caches.
+        */
+       rcu_barrier_tasks_trace();
+       rcu_barrier();
+       free_mem_alloc_no_barrier(ma);
+}
+
+static void free_mem_alloc_deferred(struct work_struct *work)
+{
+       struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);
+
+       free_mem_alloc(ma);
+       kfree(ma);
+}
+
+static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
+{
+       struct bpf_mem_alloc *copy;
+
+       if (!rcu_in_progress) {
+               /* Fast path. No callbacks are pending, hence no need to do
+                * rcu_barrier-s.
+                */
+               free_mem_alloc_no_barrier(ma);
+               return;
+       }
+
+       copy = kmalloc(sizeof(*ma), GFP_KERNEL);
+       if (!copy) {
+               /* Slow path with inline barrier-s */
+               free_mem_alloc(ma);
+               return;
+       }
+
+       /* Defer barriers into worker to let the rest of map memory to be freed */
+       copy->cache = ma->cache;
+       ma->cache = NULL;
+       copy->caches = ma->caches;
+       ma->caches = NULL;
+       INIT_WORK(&copy->work, free_mem_alloc_deferred);
+       queue_work(system_unbound_wq, &copy->work);
+}
+
  void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
  {
         struct bpf_mem_caches *cc;
         struct bpf_mem_cache *c;
-       int cpu, i;
+       int cpu, i, rcu_in_progress;
  
         if (ma->cache) {
+               rcu_in_progress = 0;
                 for_each_possible_cpu(cpu) {
                         c = per_cpu_ptr(ma->cache, cpu);
                         drain_mem_cache(c);
+                       rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
                 }
                 /* objcg is the same across cpus */
                 if (c->objcg)
                         obj_cgroup_put(c->objcg);
-               /* c->waiting_for_gp list was drained, but __free_rcu might
-                * still execute. Wait for it now before we free 'c'.
-                */
-               rcu_barrier_tasks_trace();
-               rcu_barrier();
-               free_percpu(ma->cache);
-               ma->cache = NULL;
+               destroy_mem_alloc(ma, rcu_in_progress);
         }
         if (ma->caches) {
+               rcu_in_progress = 0;
                 for_each_possible_cpu(cpu) {
                         cc = per_cpu_ptr(ma->caches, cpu);
                         for (i = 0; i < NUM_CACHES; i++) {
                                 c = &cc->cache[i];
                                 drain_mem_cache(c);
+                               rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
                         }
                 }
                 if (c->objcg)
                         obj_cgroup_put(c->objcg);
-               rcu_barrier_tasks_trace();
-               rcu_barrier();
-               free_percpu(ma->caches);
-               ma->caches = NULL;
+               destroy_mem_alloc(ma, rcu_in_progress);
         }
  }
author	Alexei Starovoitov <ast@kernel.org>
	Fri, 2 Sep 2022 21:10:58 +0000 (14:10 -0700)
committer	Daniel Borkmann <daniel@iogearbox.net>
	Mon, 5 Sep 2022 13:33:07 +0000 (15:33 +0200)
include/linux/bpf_mem_alloc.h		patch \| blob \| history
kernel/bpf/hashtab.c		patch \| blob \| history
kernel/bpf/memalloc.c		patch \| blob \| history