bpf: Adapt copy_map_value for multiple offset case
author Kumar Kartikeya Dwivedi <memxor@gmail.com>
Sun, 24 Apr 2022 21:48:53 +0000 (03:18 +0530)
committer Alexei Starovoitov <ast@kernel.org>
Tue, 26 Apr 2022 03:26:44 +0000 (20:26 -0700)
Since there can now be up to 10 offsets that need handling in
copy_map_value, the manual shuffling and special-case handling is no
longer workable. Hence, generalise the copy_map_value function to use a
sorted array of offsets to skip the regions that must be avoided while
copying into and out of a map value.
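
For illustration only (not part of the patch), below is a minimal,
self-contained userspace sketch of the skip-copy idea. The struct and
function names are invented for the example; only the cnt/field_off/
field_sz shape mirrors the new struct bpf_map_off_arr:

  #include <string.h>

  /* Mirrors the shape of struct bpf_map_off_arr: a sorted (ascending)
   * list of offsets of special fields plus the size of each field.
   */
  struct off_arr_sketch {
          unsigned int  cnt;           /* number of special fields   */
          unsigned int  field_off[10]; /* offsets, sorted ascending  */
          unsigned char field_sz[10];  /* size of each special field */
  };

  /* Copy value_size bytes from src to dst while skipping every region
   * described by the offset array.
   */
  static void copy_skipping(const struct off_arr_sketch *oa, void *dst,
                            const void *src, unsigned int value_size)
  {
          unsigned int curr = 0, i;

          for (i = 0; i < oa->cnt; i++) {
                  unsigned int next = oa->field_off[i];

                  /* copy the gap before the special field ... */
                  memcpy((char *)dst + curr, (const char *)src + curr,
                         next - curr);
                  /* ... then jump past the field itself */
                  curr = next + oa->field_sz[i];
          }
          /* copy the tail after the last special field */
          memcpy((char *)dst + curr, (const char *)src + curr,
                 value_size - curr);
  }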

When the map is created, we populate this offset array in struct
bpf_map. copy_map_value then walks the sorted offsets and memcpy()s
around them, skipping the timer, spin lock, and kptr fields. The array
is allocated separately because in most cases none of these special
fields are present in the map value, so for the common case we save
space by storing a pointer rather than embedding the whole object in
struct bpf_map.
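
As a hypothetical example (the value layout below is invented for
illustration, not taken from the patch), a map value containing a spin
lock, a timer and one kptr would produce an offset array with three
entries:

  /* Hypothetical BPF map value layout: */
  struct val {
          long counter;               /* copied                 */
          struct bpf_spin_lock lock;  /* skipped, 4 bytes       */
          char name[12];              /* copied                 */
          struct bpf_timer t;         /* skipped, 16 bytes      */
          struct task_struct *task;   /* kptr: skipped, 8 bytes */
  };

  /* After bpf_map_alloc_off_arr(), conceptually:
   *   off_arr->cnt       == 3
   *   off_arr->field_off == { offsetof(struct val, lock),
   *                           offsetof(struct val, t),
   *                           offsetof(struct val, task) }  (sorted)
   *   off_arr->field_sz  == { 4, 16, 8 }
   * and copy_map_value() memcpy()s the gaps around these three fields.
   */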

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220424214901.2743946-6-memxor@gmail.com
include/linux/bpf.h
kernel/bpf/syscall.c

index e13a5cbd4ebb551ef2a7322b104c5f3a9ef11bb0..4e12b27a5de6b4f2ef839b83bf3d591ef2e0649b 100644 (file)
@@ -158,6 +158,9 @@ struct bpf_map_ops {
 enum {
        /* Support at most 8 pointers in a BPF map value */
        BPF_MAP_VALUE_OFF_MAX = 8,
+       BPF_MAP_OFF_ARR_MAX   = BPF_MAP_VALUE_OFF_MAX +
+                               1 + /* for bpf_spin_lock */
+                               1,  /* for bpf_timer */
 };
 
 enum bpf_kptr_type {
@@ -179,6 +182,12 @@ struct bpf_map_value_off {
        struct bpf_map_value_off_desc off[];
 };
 
+struct bpf_map_off_arr {
+       u32 cnt;
+       u32 field_off[BPF_MAP_OFF_ARR_MAX];
+       u8 field_sz[BPF_MAP_OFF_ARR_MAX];
+};
+
 struct bpf_map {
        /* The first two cachelines with read-mostly members of which some
         * are also accessed in fast-path (e.g. ops, max_entries).
@@ -207,10 +216,7 @@ struct bpf_map {
        struct mem_cgroup *memcg;
 #endif
        char name[BPF_OBJ_NAME_LEN];
-       bool bypass_spec_v1;
-       bool frozen; /* write-once; write-protected by freeze_mutex */
-       /* 6 bytes hole */
-
+       struct bpf_map_off_arr *off_arr;
        /* The 3rd and 4th cacheline with misc members to avoid false sharing
         * particularly with refcounting.
         */
@@ -230,6 +236,8 @@ struct bpf_map {
                bool jited;
                bool xdp_has_frags;
        } owner;
+       bool bypass_spec_v1;
+       bool frozen; /* write-once; write-protected by freeze_mutex */
 };
 
 static inline bool map_value_has_spin_lock(const struct bpf_map *map)
@@ -253,37 +261,33 @@ static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
                memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock));
        if (unlikely(map_value_has_timer(map)))
                memset(dst + map->timer_off, 0, sizeof(struct bpf_timer));
+       if (unlikely(map_value_has_kptrs(map))) {
+               struct bpf_map_value_off *tab = map->kptr_off_tab;
+               int i;
+
+               for (i = 0; i < tab->nr_off; i++)
+                       *(u64 *)(dst + tab->off[i].offset) = 0;
+       }
 }
 
 /* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */
 static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
 {
-       u32 s_off = 0, s_sz = 0, t_off = 0, t_sz = 0;
+       u32 curr_off = 0;
+       int i;
 
-       if (unlikely(map_value_has_spin_lock(map))) {
-               s_off = map->spin_lock_off;
-               s_sz = sizeof(struct bpf_spin_lock);
-       }
-       if (unlikely(map_value_has_timer(map))) {
-               t_off = map->timer_off;
-               t_sz = sizeof(struct bpf_timer);
+       if (likely(!map->off_arr)) {
+               memcpy(dst, src, map->value_size);
+               return;
        }
 
-       if (unlikely(s_sz || t_sz)) {
-               if (s_off < t_off || !s_sz) {
-                       swap(s_off, t_off);
-                       swap(s_sz, t_sz);
-               }
-               memcpy(dst, src, t_off);
-               memcpy(dst + t_off + t_sz,
-                      src + t_off + t_sz,
-                      s_off - t_off - t_sz);
-               memcpy(dst + s_off + s_sz,
-                      src + s_off + s_sz,
-                      map->value_size - s_off - s_sz);
-       } else {
-               memcpy(dst, src, map->value_size);
+       for (i = 0; i < map->off_arr->cnt; i++) {
+               u32 next_off = map->off_arr->field_off[i];
+
+               memcpy(dst + curr_off, src + curr_off, next_off - curr_off);
+               curr_off = next_off + map->off_arr->field_sz[i];
        }
+       memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off);
 }
 void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
                           bool lock_src);
index 575b09339360ad9689d2f4edbb7705e4ffe327c0..4d419a3effc4ed403facdca42a1b4b83a8c7e6df 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/pgtable.h>
 #include <linux/bpf_lsm.h>
 #include <linux/poll.h>
+#include <linux/sort.h>
 #include <linux/bpf-netns.h>
 #include <linux/rcupdate_trace.h>
 #include <linux/memcontrol.h>
@@ -551,6 +552,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
        struct bpf_map *map = container_of(work, struct bpf_map, work);
 
        security_bpf_map_free(map);
+       kfree(map->off_arr);
        bpf_map_free_kptr_off_tab(map);
        bpf_map_release_memcg(map);
        /* implementation dependent freeing */
@@ -840,6 +842,84 @@ int map_check_no_btf(const struct bpf_map *map,
        return -ENOTSUPP;
 }
 
+static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv)
+{
+       const u32 a = *(const u32 *)_a;
+       const u32 b = *(const u32 *)_b;
+
+       if (a < b)
+               return -1;
+       else if (a > b)
+               return 1;
+       return 0;
+}
+
+static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv)
+{
+       struct bpf_map *map = (struct bpf_map *)priv;
+       u32 *off_base = map->off_arr->field_off;
+       u32 *a = _a, *b = _b;
+       u8 *sz_a, *sz_b;
+
+       sz_a = map->off_arr->field_sz + (a - off_base);
+       sz_b = map->off_arr->field_sz + (b - off_base);
+
+       swap(*a, *b);
+       swap(*sz_a, *sz_b);
+}
+
+static int bpf_map_alloc_off_arr(struct bpf_map *map)
+{
+       bool has_spin_lock = map_value_has_spin_lock(map);
+       bool has_timer = map_value_has_timer(map);
+       bool has_kptrs = map_value_has_kptrs(map);
+       struct bpf_map_off_arr *off_arr;
+       u32 i;
+
+       if (!has_spin_lock && !has_timer && !has_kptrs) {
+               map->off_arr = NULL;
+               return 0;
+       }
+
+       off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN);
+       if (!off_arr)
+               return -ENOMEM;
+       map->off_arr = off_arr;
+
+       off_arr->cnt = 0;
+       if (has_spin_lock) {
+               i = off_arr->cnt;
+
+               off_arr->field_off[i] = map->spin_lock_off;
+               off_arr->field_sz[i] = sizeof(struct bpf_spin_lock);
+               off_arr->cnt++;
+       }
+       if (has_timer) {
+               i = off_arr->cnt;
+
+               off_arr->field_off[i] = map->timer_off;
+               off_arr->field_sz[i] = sizeof(struct bpf_timer);
+               off_arr->cnt++;
+       }
+       if (has_kptrs) {
+               struct bpf_map_value_off *tab = map->kptr_off_tab;
+               u32 *off = &off_arr->field_off[off_arr->cnt];
+               u8 *sz = &off_arr->field_sz[off_arr->cnt];
+
+               for (i = 0; i < tab->nr_off; i++) {
+                       *off++ = tab->off[i].offset;
+                       *sz++ = sizeof(u64);
+               }
+               off_arr->cnt += tab->nr_off;
+       }
+
+       if (off_arr->cnt == 1)
+               return 0;
+       sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]),
+              map_off_arr_cmp, map_off_arr_swap, map);
+       return 0;
+}
+
 static int map_check_btf(struct bpf_map *map, const struct btf *btf,
                         u32 btf_key_id, u32 btf_value_id)
 {
@@ -1009,10 +1089,14 @@ static int map_create(union bpf_attr *attr)
                        attr->btf_vmlinux_value_type_id;
        }
 
-       err = security_bpf_map_alloc(map);
+       err = bpf_map_alloc_off_arr(map);
        if (err)
                goto free_map;
 
+       err = security_bpf_map_alloc(map);
+       if (err)
+               goto free_map_off_arr;
+
        err = bpf_map_alloc_id(map);
        if (err)
                goto free_map_sec;
@@ -1035,6 +1119,8 @@ static int map_create(union bpf_attr *attr)
 
 free_map_sec:
        security_bpf_map_free(map);
+free_map_off_arr:
+       kfree(map->off_arr);
 free_map:
        btf_put(map->btf);
        map->ops->map_free(map);