bpf: INET_DIAG support in bpf_sk_storage
author Martin KaFai Lau <kafai@fb.com>
Tue, 25 Feb 2020 23:04:21 +0000 (15:04 -0800)
committer Alexei Starovoitov <ast@kernel.org>
Fri, 28 Feb 2020 02:50:19 +0000 (18:50 -0800)
This patch adds INET_DIAG support to bpf_sk_storage.

1. Although this series adds the bpf_sk_storage diag capability to
   inet sockets, bpf_sk_storage is in general applicable to all
   fullsocks.  Hence, the bpf_sk_storage logic operates on SK_DIAG_*
   nlattrs.  The caller passes in its specific nesting nlattr
   (e.g. INET_DIAG_*) as an argument.

2. The request looks like:
	INET_DIAG_REQ_SK_BPF_STORAGES (nla_nest) (defined in a later patch)
		SK_DIAG_BPF_STORAGE_REQ_MAP_FD (nla_put_u32)
		SK_DIAG_BPF_STORAGE_REQ_MAP_FD (nla_put_u32)
		......

   Considering there could be multiple bpf_sk_storages in a sk,
   instead of reusing INET_DIAG_INFO ("ss -i"), the user can select
   specific bpf_sk_storage(s) to dump by specifying an array of
   SK_DIAG_BPF_STORAGE_REQ_MAP_FD.

   If no SK_DIAG_BPF_STORAGE_REQ_MAP_FD is specified (i.e. an empty
   INET_DIAG_REQ_SK_BPF_STORAGES), all bpf_sk_storages of a sk will
   be dumped.  A userspace sketch of building such a request follows.
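
   The following is a minimal userspace sketch of building that request
   with libmnl.  It is not part of this patch: the libmnl calls, the
   build_req() helper name, and the INET_DIAG_REQ_SK_BPF_STORAGES value
   (defined in a later patch of this series) are assumptions for
   illustration only.

	#include <stdint.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <libmnl/libmnl.h>
	#include <linux/netlink.h>
	#include <linux/inet_diag.h>
	#include <linux/sock_diag.h>

	/* buf must be at least MNL_SOCKET_BUFFER_SIZE bytes */
	static struct nlmsghdr *build_req(char *buf, int map_fd)
	{
		struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
		struct inet_diag_req_v2 *req;
		struct nlattr *nest;

		nlh->nlmsg_type = SOCK_DIAG_BY_FAMILY;
		nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;

		req = mnl_nlmsg_put_extra_header(nlh, sizeof(*req));
		req->sdiag_family = AF_INET;
		req->sdiag_protocol = IPPROTO_TCP;
		req->idiag_states = ~0U;	/* all TCP states */

		/* INET_DIAG_REQ_SK_BPF_STORAGES (nla_nest) */
		nest = mnl_attr_nest_start(nlh, INET_DIAG_REQ_SK_BPF_STORAGES);
		/* one SK_DIAG_BPF_STORAGE_REQ_MAP_FD per selected map */
		mnl_attr_put_u32(nlh, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, map_fd);
		mnl_attr_nest_end(nlh, nest);

		return nlh;
	}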

3. The reply looks like (a parsing sketch follows the layout):
	INET_DIAG_BPF_SK_STORAGES (nla_nest) (defined in a later patch)
		SK_DIAG_BPF_STORAGE (nla_nest)
			SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
			SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
		SK_DIAG_BPF_STORAGE (nla_nest)
			SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
			SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
		......
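
   And a matching userspace sketch of walking one reply message with
   libmnl (again illustration only: parse_storages() is a hypothetical
   helper and INET_DIAG_BPF_SK_STORAGES comes from a later patch):

	#include <stdio.h>
	#include <stdint.h>
	#include <libmnl/libmnl.h>
	#include <linux/inet_diag.h>
	#include <linux/sock_diag.h>

	static void parse_storages(const struct nlmsghdr *nlh)
	{
		struct inet_diag_msg *msg = mnl_nlmsg_get_payload(nlh);
		const struct nlattr *attr;

		mnl_attr_for_each(attr, nlh, sizeof(*msg)) {
			const struct nlattr *stg;

			if (mnl_attr_get_type(attr) != INET_DIAG_BPF_SK_STORAGES)
				continue;

			/* one SK_DIAG_BPF_STORAGE nest per bpf_sk_storage */
			mnl_attr_for_each_nested(stg, attr) {
				const struct nlattr *f;
				unsigned int value_len = 0;
				uint32_t map_id = 0;

				mnl_attr_for_each_nested(f, stg) {
					if (mnl_attr_get_type(f) == SK_DIAG_BPF_STORAGE_MAP_ID)
						map_id = mnl_attr_get_u32(f);
					else if (mnl_attr_get_type(f) == SK_DIAG_BPF_STORAGE_MAP_VALUE)
						value_len = mnl_attr_get_payload_len(f);
				}
				printf("map_id %u: %u value bytes\n", map_id, value_len);
			}
		}
	}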

4. Unlike other INET_DIAG info of a sk, which is pretty static, the size
   required to dump the bpf_sk_storage(s) of a sk is dynamic: it grows
   as the system adds more bpf_sk_storage_maps.  It is hard to set a
   static min_dump_alloc size.

   Hence, this series learns the required size at runtime and adjusts
   cb->min_dump_alloc as it iterates all sk(s) of the system.  The
   "unsigned int *res_diag_size" argument of bpf_sk_storage_diag_put()
   is for this purpose.

   The next patch will update cb->min_dump_alloc as it iterates
   the sk(s); a sketch of that adjustment follows.
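
   As an illustration, a hypothetical dumper fragment (the real wiring
   lands in the next patch; the function name and the exact
   min_dump_alloc arithmetic here are assumptions):

	static int sk_diag_put_storages(struct bpf_sk_storage_diag *diag,
					struct sock *sk, struct sk_buff *skb,
					struct netlink_callback *cb)
	{
		unsigned int res_diag_size;
		int err;

		err = bpf_sk_storage_diag_put(diag, sk, skb,
					      INET_DIAG_BPF_SK_STORAGES,
					      &res_diag_size);
		/* On -EMSGSIZE the nlattrs did not all fit, but
		 * res_diag_size still reports the space this sk needs.
		 * Grow min_dump_alloc so that the next dump round
		 * allocates a big enough skb.
		 */
		if (res_diag_size > cb->min_dump_alloc)
			cb->min_dump_alloc = res_diag_size;

		return err;
	}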

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20200225230421.1975729-1-kafai@fb.com
include/linux/bpf.h
include/net/bpf_sk_storage.h
include/uapi/linux/sock_diag.h
kernel/bpf/syscall.c
net/core/bpf_sk_storage.c

index 9aa33b8f3d5523b6ef9f98e3c623072eed2fe70f..6015a4daf118cb24911addc59881778cb8f652ae 100644
@@ -1023,6 +1023,7 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux,
 void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock);
 void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock);
 
+struct bpf_map *bpf_map_get(u32 ufd);
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
 void bpf_map_inc(struct bpf_map *map);
index 8e4f831d2e52e59a12aa3acab43c0de85b39d6c7..5036c94c0503550eb634b07462987c31b88198d5 100644
@@ -10,14 +10,41 @@ void bpf_sk_storage_free(struct sock *sk);
 extern const struct bpf_func_proto bpf_sk_storage_get_proto;
 extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
 
+struct bpf_sk_storage_diag;
+struct sk_buff;
+struct nlattr;
+struct sock;
+
 #ifdef CONFIG_BPF_SYSCALL
 int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
+struct bpf_sk_storage_diag *
+bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs);
+void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag);
+int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
+                           struct sock *sk, struct sk_buff *skb,
+                           int stg_array_type,
+                           unsigned int *res_diag_size);
 #else
 static inline int bpf_sk_storage_clone(const struct sock *sk,
                                       struct sock *newsk)
 {
        return 0;
 }
+static inline struct bpf_sk_storage_diag *
+bpf_sk_storage_diag_alloc(const struct nlattr *nla)
+{
+       return NULL;
+}
+static inline void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag)
+{
+}
+static inline int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
+                                         struct sock *sk, struct sk_buff *skb,
+                                         int stg_array_type,
+                                         unsigned int *res_diag_size)
+{
+       return 0;
+}
 #endif
 
 #endif /* _BPF_SK_STORAGE_H */
index e5925009a6529bee40facfdf332594aee5e156f1..5f74a5f6091d01c81700914c5cd53ce4cdfac61c 100644
@@ -36,4 +36,30 @@ enum sknetlink_groups {
 };
 #define SKNLGRP_MAX    (__SKNLGRP_MAX - 1)
 
+enum {
+       SK_DIAG_BPF_STORAGE_REQ_NONE,
+       SK_DIAG_BPF_STORAGE_REQ_MAP_FD,
+       __SK_DIAG_BPF_STORAGE_REQ_MAX,
+};
+
+#define SK_DIAG_BPF_STORAGE_REQ_MAX    (__SK_DIAG_BPF_STORAGE_REQ_MAX - 1)
+
+enum {
+       SK_DIAG_BPF_STORAGE_REP_NONE,
+       SK_DIAG_BPF_STORAGE,
+       __SK_DIAG_BPF_STORAGE_REP_MAX,
+};
+
+#define SK_DIAG_BPF_STORAGE_REP_MAX    (__SK_DIAG_BPF_STORAGE_REP_MAX - 1)
+
+enum {
+       SK_DIAG_BPF_STORAGE_NONE,
+       SK_DIAG_BPF_STORAGE_PAD,
+       SK_DIAG_BPF_STORAGE_MAP_ID,
+       SK_DIAG_BPF_STORAGE_MAP_VALUE,
+       __SK_DIAG_BPF_STORAGE_MAX,
+};
+
+#define SK_DIAG_BPF_STORAGE_MAX        (__SK_DIAG_BPF_STORAGE_MAX - 1)
+
 #endif /* _UAPI__SOCK_DIAG_H__ */
index a79743a89815814f1aa3397e1fa257f8a37e5b12..c536c65256add9d7cd15569d93d5931d1ab7e8e2 100644
@@ -902,6 +902,21 @@ void bpf_map_inc_with_uref(struct bpf_map *map)
 }
 EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
 
+struct bpf_map *bpf_map_get(u32 ufd)
+{
+       struct fd f = fdget(ufd);
+       struct bpf_map *map;
+
+       map = __bpf_map_get(f);
+       if (IS_ERR(map))
+               return map;
+
+       bpf_map_inc(map);
+       fdput(f);
+
+       return map;
+}
+
 struct bpf_map *bpf_map_get_with_uref(u32 ufd)
 {
        struct fd f = fdget(ufd);
index 3ab23f698221cf1325f74c7ea20fcb007837b415..3415a4896c59925fa4d6324ecb35f5087eb83e3d 100644
@@ -8,6 +8,7 @@
 #include <linux/bpf.h>
 #include <net/bpf_sk_storage.h>
 #include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
 #include <uapi/linux/btf.h>
 
 static atomic_t cache_idx;
@@ -606,6 +607,14 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
        kfree(map);
 }
 
+/* U16_MAX is much more than enough for sk local storage
+ * considering a tcp_sock is ~2k.
+ */
+#define MAX_VALUE_SIZE                                                 \
+       min_t(u32,                                                      \
+             (KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem)), \
+             (U16_MAX - sizeof(struct bpf_sk_storage_elem)))
+
 static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
 {
        if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
@@ -619,12 +628,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       if (attr->value_size >= KMALLOC_MAX_SIZE -
-           MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) ||
-           /* U16_MAX is much more than enough for sk local storage
-            * considering a tcp_sock is ~2k.
-            */
-           attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem))
+       if (attr->value_size > MAX_VALUE_SIZE)
                return -E2BIG;
 
        return 0;
@@ -910,3 +914,270 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = {
        .arg1_type      = ARG_CONST_MAP_PTR,
        .arg2_type      = ARG_PTR_TO_SOCKET,
 };
+
+struct bpf_sk_storage_diag {
+       u32 nr_maps;
+       struct bpf_map *maps[];
+};
+
+/* The reply will be like:
+ * INET_DIAG_BPF_SK_STORAGES (nla_nest)
+ *     SK_DIAG_BPF_STORAGE (nla_nest)
+ *             SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
+ *             SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
+ *     SK_DIAG_BPF_STORAGE (nla_nest)
+ *             SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
+ *             SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
+ *     ....
+ */
+static int nla_value_size(u32 value_size)
+{
+       /* SK_DIAG_BPF_STORAGE (nla_nest)
+        *      SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
+        *      SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
+        */
+       return nla_total_size(0) + nla_total_size(sizeof(u32)) +
+               nla_total_size_64bit(value_size);
+}
+
+void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag)
+{
+       u32 i;
+
+       if (!diag)
+               return;
+
+       for (i = 0; i < diag->nr_maps; i++)
+               bpf_map_put(diag->maps[i]);
+
+       kfree(diag);
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free);
+
+static bool diag_check_dup(const struct bpf_sk_storage_diag *diag,
+                          const struct bpf_map *map)
+{
+       u32 i;
+
+       for (i = 0; i < diag->nr_maps; i++) {
+               if (diag->maps[i] == map)
+                       return true;
+       }
+
+       return false;
+}
+
+struct bpf_sk_storage_diag *
+bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
+{
+       struct bpf_sk_storage_diag *diag;
+       struct nlattr *nla;
+       u32 nr_maps = 0;
+       int rem, err;
+
+       /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as
+        * the map_alloc_check() side also does.
+        */
+       if (!capable(CAP_SYS_ADMIN))
+               return ERR_PTR(-EPERM);
+
+       nla_for_each_nested(nla, nla_stgs, rem) {
+               if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
+                       nr_maps++;
+       }
+
+       diag = kzalloc(sizeof(*diag) + sizeof(diag->maps[0]) * nr_maps,
+                      GFP_KERNEL);
+       if (!diag)
+               return ERR_PTR(-ENOMEM);
+
+       nla_for_each_nested(nla, nla_stgs, rem) {
+               struct bpf_map *map;
+               int map_fd;
+
+               if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
+                       continue;
+
+               map_fd = nla_get_u32(nla);
+               map = bpf_map_get(map_fd);
+               if (IS_ERR(map)) {
+                       err = PTR_ERR(map);
+                       goto err_free;
+               }
+               if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) {
+                       bpf_map_put(map);
+                       err = -EINVAL;
+                       goto err_free;
+               }
+               if (diag_check_dup(diag, map)) {
+                       bpf_map_put(map);
+                       err = -EEXIST;
+                       goto err_free;
+               }
+               diag->maps[diag->nr_maps++] = map;
+       }
+
+       return diag;
+
+err_free:
+       bpf_sk_storage_diag_free(diag);
+       return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc);
+
+static int diag_get(struct bpf_sk_storage_data *sdata, struct sk_buff *skb)
+{
+       struct nlattr *nla_stg, *nla_value;
+       struct bpf_sk_storage_map *smap;
+
+       /* It cannot exceed max nlattr's payload */
+       BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < MAX_VALUE_SIZE);
+
+       nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE);
+       if (!nla_stg)
+               return -EMSGSIZE;
+
+       smap = rcu_dereference(sdata->smap);
+       if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id))
+               goto errout;
+
+       nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE,
+                                     smap->map.value_size,
+                                     SK_DIAG_BPF_STORAGE_PAD);
+       if (!nla_value)
+               goto errout;
+
+       if (map_value_has_spin_lock(&smap->map))
+               copy_map_value_locked(&smap->map, nla_data(nla_value),
+                                     sdata->data, true);
+       else
+               copy_map_value(&smap->map, nla_data(nla_value), sdata->data);
+
+       nla_nest_end(skb, nla_stg);
+       return 0;
+
+errout:
+       nla_nest_cancel(skb, nla_stg);
+       return -EMSGSIZE;
+}
+
+static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb,
+                                      int stg_array_type,
+                                      unsigned int *res_diag_size)
+{
+       /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
+       unsigned int diag_size = nla_total_size(0);
+       struct bpf_sk_storage *sk_storage;
+       struct bpf_sk_storage_elem *selem;
+       struct bpf_sk_storage_map *smap;
+       struct nlattr *nla_stgs;
+       unsigned int saved_len;
+       int err = 0;
+
+       rcu_read_lock();
+
+       sk_storage = rcu_dereference(sk->sk_bpf_storage);
+       if (!sk_storage || hlist_empty(&sk_storage->list)) {
+               rcu_read_unlock();
+               return 0;
+       }
+
+       nla_stgs = nla_nest_start(skb, stg_array_type);
+       if (!nla_stgs)
+               /* Continue to learn diag_size */
+               err = -EMSGSIZE;
+
+       saved_len = skb->len;
+       hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
+               smap = rcu_dereference(SDATA(selem)->smap);
+               diag_size += nla_value_size(smap->map.value_size);
+
+               if (nla_stgs && diag_get(SDATA(selem), skb))
+                       /* Continue to learn diag_size */
+                       err = -EMSGSIZE;
+       }
+
+       rcu_read_unlock();
+
+       if (nla_stgs) {
+               if (saved_len == skb->len)
+                       nla_nest_cancel(skb, nla_stgs);
+               else
+                       nla_nest_end(skb, nla_stgs);
+       }
+
+       if (diag_size == nla_total_size(0)) {
+               *res_diag_size = 0;
+               return 0;
+       }
+
+       *res_diag_size = diag_size;
+       return err;
+}
+
+int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
+                           struct sock *sk, struct sk_buff *skb,
+                           int stg_array_type,
+                           unsigned int *res_diag_size)
+{
+       /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
+       unsigned int diag_size = nla_total_size(0);
+       struct bpf_sk_storage *sk_storage;
+       struct bpf_sk_storage_data *sdata;
+       struct nlattr *nla_stgs;
+       unsigned int saved_len;
+       int err = 0;
+       u32 i;
+
+       *res_diag_size = 0;
+
+       /* No map has been specified.  Dump all. */
+       if (!diag->nr_maps)
+               return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type,
+                                                  res_diag_size);
+
+       rcu_read_lock();
+       sk_storage = rcu_dereference(sk->sk_bpf_storage);
+       if (!sk_storage || hlist_empty(&sk_storage->list)) {
+               rcu_read_unlock();
+               return 0;
+       }
+
+       nla_stgs = nla_nest_start(skb, stg_array_type);
+       if (!nla_stgs)
+               /* Continue to learn diag_size */
+               err = -EMSGSIZE;
+
+       saved_len = skb->len;
+       for (i = 0; i < diag->nr_maps; i++) {
+               sdata = __sk_storage_lookup(sk_storage,
+                               (struct bpf_sk_storage_map *)diag->maps[i],
+                               false);
+
+               if (!sdata)
+                       continue;
+
+               diag_size += nla_value_size(diag->maps[i]->value_size);
+
+               if (nla_stgs && diag_get(sdata, skb))
+                       /* Continue to learn diag_size */
+                       err = -EMSGSIZE;
+       }
+       rcu_read_unlock();
+
+       if (nla_stgs) {
+               if (saved_len == skb->len)
+                       nla_nest_cancel(skb, nla_stgs);
+               else
+                       nla_nest_end(skb, nla_stgs);
+       }
+
+       if (diag_size == nla_total_size(0)) {
+               *res_diag_size = 0;
+               return 0;
+       }
+
+       *res_diag_size = diag_size;
+       return err;
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put);