bpf: Add lookup_and_delete_elem support to hashtab
author Denis Salopek <denis.salopek@sartura.hr>
Tue, 11 May 2021 21:00:04 +0000 (23:00 +0200)
committer Andrii Nakryiko <andrii@kernel.org>
Mon, 24 May 2021 20:30:26 +0000 (13:30 -0700)
Extend the existing bpf_map_lookup_and_delete_elem() functionality to
hashtab map types, in addition to stacks and queues.
Create a new hashtab bpf_map_ops function that does lookup and deletion
of the element under the same bucket lock and add the created map_ops to
bpf.h.

Signed-off-by: Denis Salopek <denis.salopek@sartura.hr>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/4d18480a3e990ffbf14751ddef0325eed3be2966.1620763117.git.denis.salopek@sartura.hr
include/linux/bpf.h
include/uapi/linux/bpf.h
kernel/bpf/hashtab.c
kernel/bpf/syscall.c
tools/include/uapi/linux/bpf.h

index 9dc44ba..1e9a0ff 100644 (file)
@@ -70,6 +70,8 @@ struct bpf_map_ops {
        void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
        int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
                                union bpf_attr __user *uattr);
+       int (*map_lookup_and_delete_elem)(struct bpf_map *map, void *key,
+                                         void *value, u64 flags);
        int (*map_lookup_and_delete_batch)(struct bpf_map *map,
                                           const union bpf_attr *attr,
                                           union bpf_attr __user *uattr);
index 418b9b8..562adea 100644 (file)
@@ -527,6 +527,15 @@ union bpf_iter_link_info {
  *             Look up an element with the given *key* in the map referred to
  *             by the file descriptor *fd*, and if found, delete the element.
  *
+ *             For **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map
+ *             types, the *flags* argument needs to be set to 0, but for other
+ *             map types, it may be specified as:
+ *
+ *             **BPF_F_LOCK**
+ *                     Look up and delete the value of a spin-locked map
+ *                     without returning the lock. This must be specified if
+ *                     the elements contain a spinlock.
+ *
  *             The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types
  *             implement this command as a "pop" operation, deleting the top
  *             element rather than one corresponding to *key*.
@@ -536,6 +545,10 @@ union bpf_iter_link_info {
  *             This command is only valid for the following map types:
  *             * **BPF_MAP_TYPE_QUEUE**
  *             * **BPF_MAP_TYPE_STACK**
+ *             * **BPF_MAP_TYPE_HASH**
+ *             * **BPF_MAP_TYPE_PERCPU_HASH**
+ *             * **BPF_MAP_TYPE_LRU_HASH**
+ *             * **BPF_MAP_TYPE_LRU_PERCPU_HASH**
  *
  *     Return
  *             Returns zero on success. On error, -1 is returned and *errno*
index d7ebb12..9da0a04 100644 (file)
@@ -1401,6 +1401,100 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
        rcu_read_unlock();
 }
 
+static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+                                            void *value, bool is_lru_map,
+                                            bool is_percpu, u64 flags)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       struct hlist_nulls_head *head;
+       unsigned long bflags;
+       struct htab_elem *l;
+       u32 hash, key_size;
+       struct bucket *b;
+       int ret;
+
+       key_size = map->key_size;
+
+       hash = htab_map_hash(key, key_size, htab->hashrnd);
+       b = __select_bucket(htab, hash);
+       head = &b->head;
+
+       ret = htab_lock_bucket(htab, b, hash, &bflags);
+       if (ret)
+               return ret;
+
+       l = lookup_elem_raw(head, hash, key, key_size);
+       if (!l) {
+               ret = -ENOENT;
+       } else {
+               if (is_percpu) {
+                       u32 roundup_value_size = round_up(map->value_size, 8);
+                       void __percpu *pptr;
+                       int off = 0, cpu;
+
+                       pptr = htab_elem_get_ptr(l, key_size);
+                       for_each_possible_cpu(cpu) {
+                               bpf_long_memcpy(value + off,
+                                               per_cpu_ptr(pptr, cpu),
+                                               roundup_value_size);
+                               off += roundup_value_size;
+                       }
+               } else {
+                       u32 roundup_key_size = round_up(map->key_size, 8);
+
+                       if (flags & BPF_F_LOCK)
+                               copy_map_value_locked(map, value, l->key +
+                                                     roundup_key_size,
+                                                     true);
+                       else
+                               copy_map_value(map, value, l->key +
+                                              roundup_key_size);
+                       check_and_init_map_lock(map, value);
+               }
+
+               hlist_nulls_del_rcu(&l->hash_node);
+               if (!is_lru_map)
+                       free_htab_elem(htab, l);
+       }
+
+       htab_unlock_bucket(htab, b, hash, bflags);
+
+       if (is_lru_map && l)
+               bpf_lru_push_free(&htab->lru, &l->lru_node);
+
+       return ret;
+}
+
+static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+                                          void *value, u64 flags)
+{
+       return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
+                                                flags);
+}
+
+static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+                                                 void *key, void *value,
+                                                 u64 flags)
+{
+       return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
+                                                flags);
+}
+
+static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+                                              void *value, u64 flags)
+{
+       return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
+                                                flags);
+}
+
+static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+                                                     void *key, void *value,
+                                                     u64 flags)
+{
+       return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
+                                                flags);
+}
+
 static int
 __htab_map_lookup_and_delete_batch(struct bpf_map *map,
                                   const union bpf_attr *attr,
@@ -1934,6 +2028,7 @@ const struct bpf_map_ops htab_map_ops = {
        .map_free = htab_map_free,
        .map_get_next_key = htab_map_get_next_key,
        .map_lookup_elem = htab_map_lookup_elem,
+       .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
        .map_update_elem = htab_map_update_elem,
        .map_delete_elem = htab_map_delete_elem,
        .map_gen_lookup = htab_map_gen_lookup,
@@ -1954,6 +2049,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
        .map_free = htab_map_free,
        .map_get_next_key = htab_map_get_next_key,
        .map_lookup_elem = htab_lru_map_lookup_elem,
+       .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
        .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
        .map_update_elem = htab_lru_map_update_elem,
        .map_delete_elem = htab_lru_map_delete_elem,
@@ -2077,6 +2173,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
        .map_free = htab_map_free,
        .map_get_next_key = htab_map_get_next_key,
        .map_lookup_elem = htab_percpu_map_lookup_elem,
+       .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
        .map_update_elem = htab_percpu_map_update_elem,
        .map_delete_elem = htab_map_delete_elem,
        .map_seq_show_elem = htab_percpu_map_seq_show_elem,
@@ -2096,6 +2193,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
        .map_free = htab_map_free,
        .map_get_next_key = htab_map_get_next_key,
        .map_lookup_elem = htab_lru_percpu_map_lookup_elem,
+       .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
        .map_update_elem = htab_lru_percpu_map_update_elem,
        .map_delete_elem = htab_lru_map_delete_elem,
        .map_seq_show_elem = htab_percpu_map_seq_show_elem,
index 1d1cd80..5045701 100644 (file)
@@ -1483,7 +1483,7 @@ free_buf:
        return err;
 }
 
-#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags
 
 static int map_lookup_and_delete_elem(union bpf_attr *attr)
 {
@@ -1499,6 +1499,9 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
        if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
                return -EINVAL;
 
+       if (attr->flags & ~BPF_F_LOCK)
+               return -EINVAL;
+
        f = fdget(ufd);
        map = __bpf_map_get(f);
        if (IS_ERR(map))
@@ -1509,24 +1512,47 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
                goto err_put;
        }
 
+       if (attr->flags &&
+           (map->map_type == BPF_MAP_TYPE_QUEUE ||
+            map->map_type == BPF_MAP_TYPE_STACK)) {
+               err = -EINVAL;
+               goto err_put;
+       }
+
+       if ((attr->flags & BPF_F_LOCK) &&
+           !map_value_has_spin_lock(map)) {
+               err = -EINVAL;
+               goto err_put;
+       }
+
        key = __bpf_copy_key(ukey, map->key_size);
        if (IS_ERR(key)) {
                err = PTR_ERR(key);
                goto err_put;
        }
 
-       value_size = map->value_size;
+       value_size = bpf_map_value_size(map);
 
        err = -ENOMEM;
        value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
        if (!value)
                goto free_key;
 
+       err = -ENOTSUPP;
        if (map->map_type == BPF_MAP_TYPE_QUEUE ||
            map->map_type == BPF_MAP_TYPE_STACK) {
                err = map->ops->map_pop_elem(map, value);
-       } else {
-               err = -ENOTSUPP;
+       } else if (map->map_type == BPF_MAP_TYPE_HASH ||
+                  map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+                  map->map_type == BPF_MAP_TYPE_LRU_HASH ||
+                  map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+               if (!bpf_map_is_dev_bound(map)) {
+                       bpf_disable_instrumentation();
+                       rcu_read_lock();
+                       err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
+                       rcu_read_unlock();
+                       bpf_enable_instrumentation();
+               }
        }
 
        if (err)
index 418b9b8..562adea 100644 (file)
@@ -527,6 +527,15 @@ union bpf_iter_link_info {
  *             Look up an element with the given *key* in the map referred to
  *             by the file descriptor *fd*, and if found, delete the element.
  *
+ *             For **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map
+ *             types, the *flags* argument needs to be set to 0, but for other
+ *             map types, it may be specified as:
+ *
+ *             **BPF_F_LOCK**
+ *                     Look up and delete the value of a spin-locked map
+ *                     without returning the lock. This must be specified if
+ *                     the elements contain a spinlock.
+ *
  *             The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types
  *             implement this command as a "pop" operation, deleting the top
  *             element rather than one corresponding to *key*.
@@ -536,6 +545,10 @@ union bpf_iter_link_info {
  *             This command is only valid for the following map types:
  *             * **BPF_MAP_TYPE_QUEUE**
  *             * **BPF_MAP_TYPE_STACK**
+ *             * **BPF_MAP_TYPE_HASH**
+ *             * **BPF_MAP_TYPE_PERCPU_HASH**
+ *             * **BPF_MAP_TYPE_LRU_HASH**
+ *             * **BPF_MAP_TYPE_LRU_PERCPU_HASH**
  *
  *     Return
  *             Returns zero on success. On error, -1 is returned and *errno*