Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
author    David S. Miller <davem@davemloft.net>
          Wed, 10 Mar 2021 02:07:05 +0000 (18:07 -0800)
committer David S. Miller <davem@davemloft.net>
          Wed, 10 Mar 2021 02:07:05 +0000 (18:07 -0800)
Alexei Starovoitov says:

====================
pull-request: bpf-next 2021-03-09

The following pull-request contains BPF updates for your *net-next* tree.

We've added 90 non-merge commits during the last 17 day(s) which contain
a total of 114 files changed, 5158 insertions(+), 1288 deletions(-).

The main changes are:

1) Faster bpf_redirect_map(), from Björn.

2) skmsg cleanup, from Cong.

3) Support for floating point types in BTF, from Ilya.

4) Documentation for sys_bpf commands, from Joe.

5) Support for sk_lookup in bpf_prog_test_run, from Lorenz.

6) Enable task local storage for tracing programs, from Song.

7) bpf_for_each_map_elem() helper, from Yonghong.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
114 files changed:
Documentation/bpf/btf.rst
Documentation/bpf/index.rst
Documentation/userspace-api/ebpf/index.rst [new file with mode: 0644]
Documentation/userspace-api/ebpf/syscall.rst [new file with mode: 0644]
Documentation/userspace-api/index.rst
MAINTAINERS
drivers/net/virtio_net.c
include/linux/bpf.h
include/linux/bpf_local_storage.h
include/linux/bpf_lsm.h
include/linux/bpf_types.h
include/linux/bpf_verifier.h
include/linux/filter.h
include/linux/netdevice.h
include/linux/sched.h
include/linux/skbuff.h
include/linux/skmsg.h
include/net/tcp.h
include/net/udp.h
include/net/xdp_sock.h
include/trace/events/xdp.h
include/uapi/linux/bpf.h
include/uapi/linux/btf.h
init/Kconfig
kernel/bpf/Makefile
kernel/bpf/arraymap.c
kernel/bpf/bpf_inode_storage.c
kernel/bpf/bpf_iter.c
kernel/bpf/bpf_local_storage.c
kernel/bpf/bpf_lsm.c
kernel/bpf/bpf_task_storage.c
kernel/bpf/btf.c
kernel/bpf/cpumap.c
kernel/bpf/devmap.c
kernel/bpf/hashtab.c
kernel/bpf/helpers.c
kernel/bpf/verifier.c
kernel/fork.c
kernel/trace/bpf_trace.c
net/Kconfig
net/bpf/test_run.c
net/core/Makefile
net/core/bpf_sk_storage.c
net/core/filter.c
net/core/skmsg.c
net/core/sock_map.c
net/ipv4/Makefile
net/ipv4/tcp_bpf.c
net/xdp/xsk.c
net/xdp/xsk_queue.h
net/xdp/xskmap.c
scripts/bpf_doc.py [moved from scripts/bpf_helpers_doc.py with 82% similarity]
tools/bpf/Makefile.helpers [deleted file]
tools/bpf/bpf_dbg.c
tools/bpf/bpf_exp.y
tools/bpf/bpftool/.gitignore
tools/bpf/bpftool/Documentation/Makefile
tools/bpf/bpftool/btf.c
tools/bpf/bpftool/btf_dumper.c
tools/bpf/bpftool/feature.c
tools/bpf/bpftool/xlated_dumper.c
tools/bpf/runqslower/Makefile
tools/bpf/runqslower/runqslower.bpf.c
tools/include/uapi/linux/bpf.h
tools/include/uapi/linux/btf.h
tools/lib/bpf/Makefile
tools/lib/bpf/btf.c
tools/lib/bpf/btf.h
tools/lib/bpf/btf_dump.c
tools/lib/bpf/libbpf.c
tools/lib/bpf/libbpf.map
tools/lib/bpf/libbpf_internal.h
tools/lib/bpf/libbpf_util.h
tools/lib/bpf/xsk.h
tools/perf/MANIFEST
tools/testing/selftests/bpf/.gitignore
tools/testing/selftests/bpf/Makefile
tools/testing/selftests/bpf/Makefile.docs [new file with mode: 0644]
tools/testing/selftests/bpf/README.rst
tools/testing/selftests/bpf/btf_helpers.c
tools/testing/selftests/bpf/prog_tests/attach_probe.c
tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
tools/testing/selftests/bpf/prog_tests/btf.c
tools/testing/selftests/bpf/prog_tests/core_reloc.c
tools/testing/selftests/bpf/prog_tests/for_each.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c
tools/testing/selftests/bpf/prog_tests/sk_lookup.c
tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
tools/testing/selftests/bpf/prog_tests/task_local_storage.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
tools/testing/selftests/bpf/progs/core_reloc_types.h
tools/testing/selftests/bpf/progs/for_each_array_map_elem.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/for_each_hash_map_elem.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/loop6.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/task_local_storage.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/task_ls_recursion.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/test_core_reloc_size.c
tools/testing/selftests/bpf/progs/test_sk_lookup.c
tools/testing/selftests/bpf/progs/test_sockmap_listen.c
tools/testing/selftests/bpf/progs/test_tc_tunnel.c
tools/testing/selftests/bpf/test_bpftool_build.sh
tools/testing/selftests/bpf/test_btf.h
tools/testing/selftests/bpf/test_doc_build.sh [new file with mode: 0755]
tools/testing/selftests/bpf/test_progs.h
tools/testing/selftests/bpf/test_sockmap.c
tools/testing/selftests/bpf/test_tc_tunnel.sh
tools/testing/selftests/bpf/test_verifier.c
tools/testing/selftests/bpf/test_xsk.sh
tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c
tools/testing/selftests/bpf/vmtest.sh
tools/testing/selftests/bpf/xdpxceiver.c
tools/testing/selftests/bpf/xdpxceiver.h
tools/testing/selftests/bpf/xsk_prereqs.sh

index 44dc789..846354c 100644 (file)
@@ -84,6 +84,7 @@ sequentially and type id is assigned to each recognized type starting from id
     #define BTF_KIND_FUNC_PROTO     13      /* Function Proto       */
     #define BTF_KIND_VAR            14      /* Variable     */
     #define BTF_KIND_DATASEC        15      /* Section      */
+    #define BTF_KIND_FLOAT          16      /* Floating point       */
 
 Note that the type section encodes debug info, not just pure types.
 ``BTF_KIND_FUNC`` is not a type, and it represents a defined subprogram.
@@ -95,8 +96,8 @@ Each type contains the following common data::
         /* "info" bits arrangement
          * bits  0-15: vlen (e.g. # of struct's members)
          * bits 16-23: unused
-         * bits 24-27: kind (e.g. int, ptr, array...etc)
-         * bits 28-30: unused
+         * bits 24-28: kind (e.g. int, ptr, array...etc)
+         * bits 29-30: unused
          * bit     31: kind_flag, currently used by
          *             struct, union and fwd
          */
@@ -452,6 +453,18 @@ map definition.
   * ``offset``: the in-section offset of the variable
   * ``size``: the size of the variable in bytes
 
+2.2.16 BTF_KIND_FLOAT
+~~~~~~~~~~~~~~~~~~~~~
+
+``struct btf_type`` encoding requirement:
+ * ``name_off``: any valid offset
+ * ``info.kind_flag``: 0
+ * ``info.kind``: BTF_KIND_FLOAT
+ * ``info.vlen``: 0
+ * ``size``: the size of the float type in bytes: 2, 4, 8, 12 or 16.
+
+No additional type data follow ``btf_type``.
+
 3. BTF Kernel API
 *****************
 
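
For illustration, a minimal userspace sketch of how the new kind could be emitted (not part of the patch; BTF_INFO_ENC is defined locally for this example, and BTF_KIND_FLOAT is taken to be 16 as in the hunk above):

#include <stdio.h>
#include <linux/btf.h>

#ifndef BTF_KIND_FLOAT
#define BTF_KIND_FLOAT	16		/* added by this series */
#endif

/* Pack the btf_type "info" word: kind_flag in bit 31, kind in bits
 * 24-28, vlen in bits 0-15, matching the layout quoted above. */
#define BTF_INFO_ENC(kind, kind_flag, vlen) \
	((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & 0xffff))

int main(void)
{
	/* Hypothetical encoding of a C "double": kind_flag = 0, vlen = 0,
	 * size = 8, name_off pointing at "double" in the string section. */
	struct btf_type flt = {
		.name_off = 1,
		.info = BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 0),
		.size = 8,
	};

	printf("kind=%u size=%u\n", (flt.info >> 24) & 0x1f, flt.size);
	return 0;
}
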
index 4f2874b..a702f67 100644 (file)
@@ -12,9 +12,6 @@ BPF instruction-set.
 The Cilium project also maintains a `BPF and XDP Reference Guide`_
 that goes into great technical depth about the BPF Architecture.
 
-The primary info for the bpf syscall is available in the `man-pages`_
-for `bpf(2)`_.
-
 BPF Type Format (BTF)
 =====================
 
@@ -35,6 +32,12 @@ Two sets of Questions and Answers (Q&A) are maintained.
    bpf_design_QA
    bpf_devel_QA
 
+Syscall API
+===========
+
+The primary info for the bpf syscall is available in the `man-pages`_
+for `bpf(2)`_. For more information about the userspace API, see
+Documentation/userspace-api/ebpf/index.rst.
 
 Helper functions
 ================
diff --git a/Documentation/userspace-api/ebpf/index.rst b/Documentation/userspace-api/ebpf/index.rst
new file mode 100644 (file)
index 0000000..473dfba
--- /dev/null
@@ -0,0 +1,17 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+eBPF Userspace API
+==================
+
+eBPF is a kernel mechanism to provide a sandboxed runtime environment in the
+Linux kernel for runtime extension and instrumentation without changing kernel
+source code or loading kernel modules. eBPF programs can be attached to various
+kernel subsystems, including networking, tracing and Linux security modules
+(LSM).
+
+For internal kernel documentation on eBPF, see Documentation/bpf/index.rst.
+
+.. toctree::
+   :maxdepth: 1
+
+   syscall
diff --git a/Documentation/userspace-api/ebpf/syscall.rst b/Documentation/userspace-api/ebpf/syscall.rst
new file mode 100644 (file)
index 0000000..ea99180
--- /dev/null
@@ -0,0 +1,24 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+eBPF Syscall
+------------
+
+:Authors: - Alexei Starovoitov <ast@kernel.org>
+          - Joe Stringer <joe@wand.net.nz>
+          - Michael Kerrisk <mtk.manpages@gmail.com>
+
+The primary info for the bpf syscall is available in the `man-pages`_
+for `bpf(2)`_.
+
+bpf() subcommand reference
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. kernel-doc:: include/uapi/linux/bpf.h
+   :doc: eBPF Syscall Preamble
+
+.. kernel-doc:: include/uapi/linux/bpf.h
+   :doc: eBPF Syscall Commands
+
+.. Links:
+.. _man-pages: https://www.kernel.org/doc/man-pages/
+.. _bpf(2): https://man7.org/linux/man-pages/man2/bpf.2.html
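
To make the documented calling convention concrete, here is a minimal, self-contained sketch (not from the patch) that issues BPF_MAP_CREATE through the raw syscall: *cmd* selects the operation, *attr* carries the per-command arguments, and the return value is a new map file descriptor:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

int main(void)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));	/* unused fields must be zero */
	attr.map_type    = BPF_MAP_TYPE_ARRAY;
	attr.key_size    = sizeof(__u32);
	attr.value_size  = sizeof(__u64);
	attr.max_entries = 16;

	int fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
	if (fd < 0) {
		perror("BPF_MAP_CREATE");
		return 1;
	}
	printf("map fd: %d\n", fd);
	return 0;
}
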
index d29b020..1e2438b 100644 (file)
@@ -21,6 +21,7 @@ place where this information is gathered.
    unshare
    spec_ctrl
    accelerators/ocxl
+   ebpf/index
    ioctl/index
    iommu
    media/index
index f2e34ed..ed9fd1f 100644 (file)
@@ -3233,6 +3233,7 @@ T:        git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git
 F:     Documentation/bpf/
 F:     Documentation/networking/filter.rst
+F:     Documentation/userspace-api/ebpf/
 F:     arch/*/net/*
 F:     include/linux/bpf*
 F:     include/linux/filter.h
@@ -3247,6 +3248,7 @@ F:        net/core/filter.c
 F:     net/sched/act_bpf.c
 F:     net/sched/cls_bpf.c
 F:     samples/bpf/
+F:     scripts/bpf_doc.py
 F:     tools/bpf/
 F:     tools/lib/bpf/
 F:     tools/testing/selftests/bpf/
index 82e520d..708a8b2 100644 (file)
@@ -2973,7 +2973,8 @@ static int virtnet_probe(struct virtio_device *vdev)
                return -ENOMEM;
 
        /* Set up network device as normal. */
-       dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
+       dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE |
+                          IFF_TX_SKB_NO_LINEAR;
        dev->netdev_ops = &virtnet_netdev;
        dev->features = NETIF_F_HIGHDMA;
 
index cccaef1..a25730e 100644 (file)
@@ -39,6 +39,7 @@ struct bpf_local_storage;
 struct bpf_local_storage_map;
 struct kobject;
 struct mem_cgroup;
+struct bpf_func_state;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -117,6 +118,9 @@ struct bpf_map_ops {
                                           void *owner, u32 size);
        struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner);
 
+       /* Misc helpers. */
+       int (*map_redirect)(struct bpf_map *map, u32 ifindex, u64 flags);
+
        /* map_meta_equal must be implemented for maps that can be
         * used as an inner map.  It is a runtime check to ensure
         * an inner map can be inserted to an outer map.
@@ -129,6 +133,13 @@ struct bpf_map_ops {
        bool (*map_meta_equal)(const struct bpf_map *meta0,
                               const struct bpf_map *meta1);
 
+
+       int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env,
+                                             struct bpf_func_state *caller,
+                                             struct bpf_func_state *callee);
+       int (*map_for_each_callback)(struct bpf_map *map, void *callback_fn,
+                                    void *callback_ctx, u64 flags);
+
        /* BTF name and id of struct allocated by map_alloc */
        const char * const map_btf_name;
        int *map_btf_id;
@@ -295,6 +306,8 @@ enum bpf_arg_type {
        ARG_CONST_ALLOC_SIZE_OR_ZERO,   /* number of allocated bytes requested */
        ARG_PTR_TO_BTF_ID_SOCK_COMMON,  /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */
        ARG_PTR_TO_PERCPU_BTF_ID,       /* pointer to in-kernel percpu type */
+       ARG_PTR_TO_FUNC,        /* pointer to a bpf program function */
+       ARG_PTR_TO_STACK_OR_NULL,       /* pointer to stack or NULL */
        __BPF_ARG_TYPE_MAX,
 };
 
@@ -411,6 +424,8 @@ enum bpf_reg_type {
        PTR_TO_RDWR_BUF,         /* reg points to a read/write buffer */
        PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */
        PTR_TO_PERCPU_BTF_ID,    /* reg points to a percpu kernel variable */
+       PTR_TO_FUNC,             /* reg points to a bpf program function */
+       PTR_TO_MAP_KEY,          /* reg points to a map element key */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -506,6 +521,11 @@ enum bpf_cgroup_storage_type {
  */
 #define MAX_BPF_FUNC_ARGS 12
 
+/* The maximum number of arguments passed through registers
+ * a single function may have.
+ */
+#define MAX_BPF_FUNC_REG_ARGS 5
+
 struct btf_func_model {
        u8 ret_size;
        u8 nr_args;
@@ -1380,6 +1400,10 @@ void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux,
 int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux,
                                struct bpf_link_info *info);
 
+int map_set_for_each_callback_args(struct bpf_verifier_env *env,
+                                  struct bpf_func_state *caller,
+                                  struct bpf_func_state *callee);
+
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
@@ -1429,9 +1453,9 @@ struct btf *bpf_get_btf_vmlinux(void);
 /* Map specifics */
 struct xdp_buff;
 struct sk_buff;
+struct bpf_dtab_netdev;
+struct bpf_cpu_map_entry;
 
-struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
-struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_flush(void);
 int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
                    struct net_device *dev_rx);
@@ -1441,7 +1465,6 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
                             struct bpf_prog *xdp_prog);
 bool dev_map_can_have_prog(struct bpf_map *map);
 
-struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
 void __cpu_map_flush(void);
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
                    struct net_device *dev_rx);
@@ -1470,6 +1493,9 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
                             const union bpf_attr *kattr,
                             union bpf_attr __user *uattr);
+int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
+                               const union bpf_attr *kattr,
+                               union bpf_attr __user *uattr);
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
                    const struct bpf_prog *prog,
                    struct bpf_insn_access_aux *info);
@@ -1499,6 +1525,7 @@ struct bpf_prog *bpf_prog_by_id(u32 id);
 struct bpf_link *bpf_link_by_id(u32 id);
 
 const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
+void bpf_task_storage_free(struct task_struct *task);
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
@@ -1568,17 +1595,6 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags)
        return -EOPNOTSUPP;
 }
 
-static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
-                                                      u32 key)
-{
-       return NULL;
-}
-
-static inline struct net_device  *__dev_map_hash_lookup_elem(struct bpf_map *map,
-                                                            u32 key)
-{
-       return NULL;
-}
 static inline bool dev_map_can_have_prog(struct bpf_map *map)
 {
        return false;
@@ -1590,6 +1606,7 @@ static inline void __dev_flush(void)
 
 struct xdp_buff;
 struct bpf_dtab_netdev;
+struct bpf_cpu_map_entry;
 
 static inline
 int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
@@ -1614,12 +1631,6 @@ static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
        return 0;
 }
 
-static inline
-struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
-{
-       return NULL;
-}
-
 static inline void __cpu_map_flush(void)
 {
 }
@@ -1670,6 +1681,13 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
        return -ENOTSUPP;
 }
 
+static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
+                                             const union bpf_attr *kattr,
+                                             union bpf_attr __user *uattr)
+{
+       return -ENOTSUPP;
+}
+
 static inline void bpf_map_put(struct bpf_map *map)
 {
 }
@@ -1684,6 +1702,10 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 {
        return NULL;
 }
+
+static inline void bpf_task_storage_free(struct task_struct *task)
+{
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
@@ -1768,22 +1790,24 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
 }
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
-#if defined(CONFIG_BPF_STREAM_PARSER)
-int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
-                        struct bpf_prog *old, u32 which);
+#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
 int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
 int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
 int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
 void sock_map_unhash(struct sock *sk);
 void sock_map_close(struct sock *sk, long timeout);
+
+void bpf_sk_reuseport_detach(struct sock *sk);
+int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
+                                      void *value);
+int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
+                                      void *value, u64 map_flags);
 #else
-static inline int sock_map_prog_update(struct bpf_map *map,
-                                      struct bpf_prog *prog,
-                                      struct bpf_prog *old, u32 which)
+static inline void bpf_sk_reuseport_detach(struct sock *sk)
 {
-       return -EOPNOTSUPP;
 }
 
+#ifdef CONFIG_BPF_SYSCALL
 static inline int sock_map_get_from_fd(const union bpf_attr *attr,
                                       struct bpf_prog *prog)
 {
@@ -1801,20 +1825,7 @@ static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void
 {
        return -EOPNOTSUPP;
 }
-#endif /* CONFIG_BPF_STREAM_PARSER */
 
-#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
-void bpf_sk_reuseport_detach(struct sock *sk);
-int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
-                                      void *value);
-int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
-                                      void *value, u64 map_flags);
-#else
-static inline void bpf_sk_reuseport_detach(struct sock *sk)
-{
-}
-
-#ifdef CONFIG_BPF_SYSCALL
 static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map,
                                                     void *key, void *value)
 {
@@ -1886,6 +1897,9 @@ extern const struct bpf_func_proto bpf_this_cpu_ptr_proto;
 extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto;
 extern const struct bpf_func_proto bpf_sock_from_file_proto;
 extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto;
+extern const struct bpf_func_proto bpf_task_storage_get_proto;
+extern const struct bpf_func_proto bpf_task_storage_delete_proto;
+extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
 
 const struct bpf_func_proto *bpf_tracing_func_proto(
        enum bpf_func_id func_id, const struct bpf_prog *prog);
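
The new bpf_for_each_map_elem() helper takes a PTR_TO_FUNC callback, which is what the ARG_PTR_TO_FUNC / map_for_each_callback plumbing above supports. A hedged sketch of the BPF-program side, modeled on the for_each selftests added in this series (map and function names are illustrative):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 16);
	__type(key, __u32);
	__type(value, __u64);
} arraymap SEC(".maps");

/* Invoked once per element; return 0 to continue the walk, 1 to stop. */
static __u64 sum_elem(struct bpf_map *map, __u32 *key, __u64 *val, void *ctx)
{
	*(__u64 *)ctx += *val;
	return 0;
}

SEC("classifier")
int sum_values(struct __sk_buff *skb)
{
	__u64 sum = 0;

	bpf_for_each_map_elem(&arraymap, sum_elem, &sum, 0);
	return 0;
}

char _license[] SEC("license") = "GPL";
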
index b2c9463..b902c58 100644 (file)
@@ -126,7 +126,8 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
                         struct bpf_local_storage_map *smap,
                         bool cacheit_lockit);
 
-void bpf_local_storage_map_free(struct bpf_local_storage_map *smap);
+void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
+                               int __percpu *busy_counter);
 
 int bpf_local_storage_map_check_btf(const struct bpf_map *map,
                                    const struct btf *btf,
index 0d1c33a..479c101 100644 (file)
@@ -38,21 +38,9 @@ static inline struct bpf_storage_blob *bpf_inode(
        return inode->i_security + bpf_lsm_blob_sizes.lbs_inode;
 }
 
-static inline struct bpf_storage_blob *bpf_task(
-       const struct task_struct *task)
-{
-       if (unlikely(!task->security))
-               return NULL;
-
-       return task->security + bpf_lsm_blob_sizes.lbs_task;
-}
-
 extern const struct bpf_func_proto bpf_inode_storage_get_proto;
 extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
-extern const struct bpf_func_proto bpf_task_storage_get_proto;
-extern const struct bpf_func_proto bpf_task_storage_delete_proto;
 void bpf_inode_storage_free(struct inode *inode);
-void bpf_task_storage_free(struct task_struct *task);
 
 #else /* !CONFIG_BPF_LSM */
 
@@ -73,20 +61,10 @@ static inline struct bpf_storage_blob *bpf_inode(
        return NULL;
 }
 
-static inline struct bpf_storage_blob *bpf_task(
-       const struct task_struct *task)
-{
-       return NULL;
-}
-
 static inline void bpf_inode_storage_free(struct inode *inode)
 {
 }
 
-static inline void bpf_task_storage_free(struct task_struct *task)
-{
-}
-
 #endif /* CONFIG_BPF_LSM */
 
 #endif /* _LINUX_BPF_LSM_H */
index 99f7fd6..f883f01 100644 (file)
@@ -103,19 +103,17 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
-#if defined(CONFIG_BPF_STREAM_PARSER)
-BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
-BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
-#endif
 #ifdef CONFIG_BPF_LSM
 BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops)
-BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops)
 #endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #if defined(CONFIG_XDP_SOCKETS)
 BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
 #endif
 #ifdef CONFIG_INET
+BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops)
 #endif
 #endif
index 971b33a..51c2ffa 100644 (file)
@@ -68,6 +68,8 @@ struct bpf_reg_state {
                        unsigned long raw1;
                        unsigned long raw2;
                } raw;
+
+               u32 subprogno; /* for PTR_TO_FUNC */
        };
        /* For PTR_TO_PACKET, used to find other pointers with the same variable
         * offset, so they can share range knowledge.
@@ -204,6 +206,7 @@ struct bpf_func_state {
        int acquired_refs;
        struct bpf_reference_state *refs;
        int allocated_stack;
+       bool in_callback_fn;
        struct bpf_stack_state *stack;
 };
 
index 3b00fc9..b2b85b2 100644 (file)
@@ -646,7 +646,8 @@ struct bpf_redirect_info {
        u32 flags;
        u32 tgt_index;
        void *tgt_value;
-       struct bpf_map *map;
+       u32 map_id;
+       enum bpf_map_type map_type;
        u32 kern_flags;
        struct bpf_nh_params nh;
 };
@@ -1472,4 +1473,32 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 }
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
+static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags,
+                                                 void *lookup_elem(struct bpf_map *map, u32 key))
+{
+       struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+       /* Lower bits of the flags are used as return code on lookup failure */
+       if (unlikely(flags > XDP_TX))
+               return XDP_ABORTED;
+
+       ri->tgt_value = lookup_elem(map, ifindex);
+       if (unlikely(!ri->tgt_value)) {
+               /* If the lookup fails we want to clear out the state in the
+                * redirect_info struct completely, so that if an eBPF program
+                * performs multiple lookups, the last one always takes
+                * precedence.
+                */
+               ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */
+               ri->map_type = BPF_MAP_TYPE_UNSPEC;
+               return flags;
+       }
+
+       ri->tgt_index = ifindex;
+       ri->map_id = map->id;
+       ri->map_type = map->map_type;
+
+       return XDP_REDIRECT;
+}
+
 #endif /* __LINUX_FILTER_H__ */
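
With ri->map gone, each map type instead implements the new ->map_redirect op (see the include/linux/bpf.h hunk above) by handing its own lookup routine to this generic inline, so bpf_redirect_map() needs no per-map-type branching at runtime. A sketch of the wiring, modeled on what this series does in net/xdp/xskmap.c:

static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
	return __bpf_xdp_redirect_map(map, ifindex, flags,
				      __xsk_map_lookup_elem);
}

const struct bpf_map_ops xsk_map_ops = {
	/* ...other ops elided... */
	.map_redirect = xsk_map_redirect,
};
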
index 5b67ea8..b379d08 100644 (file)
@@ -1518,6 +1518,8 @@ struct net_device_ops {
  * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
  * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device
  * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running
+ * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with
+ *     skb_headlen(skb) == 0 (data starts from frag0)
  */
 enum netdev_priv_flags {
        IFF_802_1Q_VLAN                 = 1<<0,
@@ -1551,6 +1553,7 @@ enum netdev_priv_flags {
        IFF_FAILOVER_SLAVE              = 1<<28,
        IFF_L3MDEV_RX_HANDLER           = 1<<29,
        IFF_LIVE_RENAME_OK              = 1<<30,
+       IFF_TX_SKB_NO_LINEAR            = 1<<31,
 };
 
 #define IFF_802_1Q_VLAN                        IFF_802_1Q_VLAN
@@ -1577,12 +1580,14 @@ enum netdev_priv_flags {
 #define IFF_L3MDEV_SLAVE               IFF_L3MDEV_SLAVE
 #define IFF_TEAM                       IFF_TEAM
 #define IFF_RXFH_CONFIGURED            IFF_RXFH_CONFIGURED
+#define IFF_PHONY_HEADROOM             IFF_PHONY_HEADROOM
 #define IFF_MACSEC                     IFF_MACSEC
 #define IFF_NO_RX_HANDLER              IFF_NO_RX_HANDLER
 #define IFF_FAILOVER                   IFF_FAILOVER
 #define IFF_FAILOVER_SLAVE             IFF_FAILOVER_SLAVE
 #define IFF_L3MDEV_RX_HANDLER          IFF_L3MDEV_RX_HANDLER
 #define IFF_LIVE_RENAME_OK             IFF_LIVE_RENAME_OK
+#define IFF_TX_SKB_NO_LINEAR           IFF_TX_SKB_NO_LINEAR
 
 /* Specifies the type of the struct net_device::ml_priv pointer */
 enum netdev_ml_priv_type {
index ef00bb2..e5b7d90 100644 (file)
@@ -42,6 +42,7 @@ struct audit_context;
 struct backing_dev_info;
 struct bio_list;
 struct blk_plug;
+struct bpf_local_storage;
 struct capture_control;
 struct cfs_rq;
 struct fs_struct;
@@ -1351,6 +1352,10 @@ struct task_struct {
        /* Used by LSM modules for access restriction: */
        void                            *security;
 #endif
+#ifdef CONFIG_BPF_SYSCALL
+       /* Used by BPF task local storage */
+       struct bpf_local_storage __rcu  *bpf_storage;
+#endif
 
 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
        unsigned long                   lowest_stack;
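
With bpf_storage hung directly off task_struct (and the task storage helpers now exported under CONFIG_BPF_SYSCALL rather than CONFIG_BPF_LSM), tracing programs can keep per-task state. A hedged sketch modeled on the new task_local_storage selftests; map and program names here are illustrative:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, long);
} syscall_count SEC(".maps");

SEC("tp_btf/sys_enter")
int BPF_PROG(count_enter, struct pt_regs *regs, long id)
{
	struct task_struct *task = bpf_get_current_task_btf();
	long *cnt;

	/* Create-on-miss storage hung off the current task. */
	cnt = bpf_task_storage_get(&syscall_count, task, 0,
				   BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (cnt)
		__sync_fetch_and_add(cnt, 1);
	return 0;
}
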
index 6d0a33d..0503c91 100644 (file)
@@ -656,6 +656,7 @@ typedef unsigned char *sk_buff_data_t;
  *     @protocol: Packet protocol from driver
  *     @destructor: Destruct function
  *     @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
+ *     @_sk_redir: socket redirection information for skmsg
  *     @_nfct: Associated connection, if any (with nfctinfo bits)
  *     @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *     @skb_iif: ifindex of device we arrived on
@@ -755,6 +756,9 @@ struct sk_buff {
                        void            (*destructor)(struct sk_buff *skb);
                };
                struct list_head        tcp_tsorted_anchor;
+#ifdef CONFIG_NET_SOCK_MSG
+               unsigned long           _sk_redir;
+#endif
        };
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
index 8edbbf5..6c09d94 100644 (file)
@@ -56,8 +56,8 @@ struct sk_msg {
 
 struct sk_psock_progs {
        struct bpf_prog                 *msg_parser;
-       struct bpf_prog                 *skb_parser;
-       struct bpf_prog                 *skb_verdict;
+       struct bpf_prog                 *stream_parser;
+       struct bpf_prog                 *stream_verdict;
 };
 
 enum sk_psock_state_bits {
@@ -70,12 +70,6 @@ struct sk_psock_link {
        void                            *link_raw;
 };
 
-struct sk_psock_parser {
-       struct strparser                strp;
-       bool                            enabled;
-       void (*saved_data_ready)(struct sock *sk);
-};
-
 struct sk_psock_work_state {
        struct sk_buff                  *skb;
        u32                             len;
@@ -90,7 +84,9 @@ struct sk_psock {
        u32                             eval;
        struct sk_msg                   *cork;
        struct sk_psock_progs           progs;
-       struct sk_psock_parser          parser;
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+       struct strparser                strp;
+#endif
        struct sk_buff_head             ingress_skb;
        struct list_head                ingress_msg;
        unsigned long                   state;
@@ -100,6 +96,7 @@ struct sk_psock {
        void (*saved_unhash)(struct sock *sk);
        void (*saved_close)(struct sock *sk, long timeout);
        void (*saved_write_space)(struct sock *sk);
+       void (*saved_data_ready)(struct sock *sk);
        struct proto                    *sk_proto;
        struct sk_psock_work_state      work_state;
        struct work_struct              work;
@@ -305,9 +302,25 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err)
 
 struct sk_psock *sk_psock_init(struct sock *sk, int node);
 
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
 int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
 void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock);
 void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock);
+#else
+static inline int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
+{
+       return -EOPNOTSUPP;
+}
+
+static inline void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
+{
+}
+
+static inline void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
+{
+}
+#endif
+
 void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock);
 void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock);
 
@@ -327,8 +340,6 @@ static inline void sk_psock_free_link(struct sk_psock_link *link)
 
 struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock);
 
-void __sk_psock_purge_ingress_msg(struct sk_psock *psock);
-
 static inline void sk_psock_cork_free(struct sk_psock *psock)
 {
        if (psock->cork) {
@@ -389,7 +400,6 @@ static inline struct sk_psock *sk_psock_get(struct sock *sk)
        return psock;
 }
 
-void sk_psock_stop(struct sock *sk, struct sk_psock *psock);
 void sk_psock_drop(struct sock *sk, struct sk_psock *psock);
 
 static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
@@ -400,8 +410,8 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
 
 static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock)
 {
-       if (psock->parser.enabled)
-               psock->parser.saved_data_ready(sk);
+       if (psock->saved_data_ready)
+               psock->saved_data_ready(sk);
        else
                sk->sk_data_ready(sk);
 }
@@ -430,8 +440,8 @@ static inline int psock_replace_prog(struct bpf_prog **pprog,
 static inline void psock_progs_drop(struct sk_psock_progs *progs)
 {
        psock_set_prog(&progs->msg_parser, NULL);
-       psock_set_prog(&progs->skb_parser, NULL);
-       psock_set_prog(&progs->skb_verdict, NULL);
+       psock_set_prog(&progs->stream_parser, NULL);
+       psock_set_prog(&progs->stream_verdict, NULL);
 }
 
 int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb);
@@ -440,6 +450,44 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
 {
        if (!psock)
                return false;
-       return psock->parser.enabled;
+       return !!psock->saved_data_ready;
+}
+
+#if IS_ENABLED(CONFIG_NET_SOCK_MSG)
+
+/* We only have one bit so far. */
+#define BPF_F_PTR_MASK ~(BPF_F_INGRESS)
+
+static inline bool skb_bpf_ingress(const struct sk_buff *skb)
+{
+       unsigned long sk_redir = skb->_sk_redir;
+
+       return sk_redir & BPF_F_INGRESS;
+}
+
+static inline void skb_bpf_set_ingress(struct sk_buff *skb)
+{
+       skb->_sk_redir |= BPF_F_INGRESS;
+}
+
+static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir,
+                                    bool ingress)
+{
+       skb->_sk_redir = (unsigned long)sk_redir;
+       if (ingress)
+               skb->_sk_redir |= BPF_F_INGRESS;
+}
+
+static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb)
+{
+       unsigned long sk_redir = skb->_sk_redir;
+
+       return (struct sock *)(sk_redir & BPF_F_PTR_MASK);
+}
+
+static inline void skb_bpf_redirect_clear(struct sk_buff *skb)
+{
+       skb->_sk_redir = 0;
 }
+#endif /* CONFIG_NET_SOCK_MSG */
 #endif /* _LINUX_SKMSG_H */
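
The new skb->_sk_redir field packs the redirect socket pointer and the BPF_F_INGRESS flag into one word; since pointers are at least 2-byte aligned, bit 0 is free for the flag. A self-contained userspace demo of the same packing trick (illustrative, not kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BPF_F_INGRESS	1UL		/* bit 0, as in the uapi */
#define SK_PTR_MASK	(~BPF_F_INGRESS)

int main(void)
{
	static long target;		/* stands in for a struct sock */
	uintptr_t redir = (uintptr_t)&target | BPF_F_INGRESS;

	/* Recover the pointer and the flag, as skb_bpf_redirect_fetch()
	 * and skb_bpf_ingress() do above. */
	assert((long *)(redir & SK_PTR_MASK) == &target);
	printf("ingress=%lu\n", redir & BPF_F_INGRESS);
	return 0;
}
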
index 963cd86..075de26 100644 (file)
@@ -883,36 +883,11 @@ struct tcp_skb_cb {
                        struct inet6_skb_parm   h6;
 #endif
                } header;       /* For incoming skbs */
-               struct {
-                       __u32 flags;
-                       struct sock *sk_redir;
-                       void *data_end;
-               } bpf;
        };
 };
 
 #define TCP_SKB_CB(__skb)      ((struct tcp_skb_cb *)&((__skb)->cb[0]))
 
-static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
-{
-       TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
-}
-
-static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
-{
-       return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
-}
-
-static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
-{
-       return TCP_SKB_CB(skb)->bpf.sk_redir;
-}
-
-static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
-{
-       TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
-}
-
 extern const struct inet_connection_sock_af_ops ipv4_specific;
 
 #if IS_ENABLED(CONFIG_IPV6)
@@ -2222,25 +2197,27 @@ void tcp_update_ulp(struct sock *sk, struct proto *p,
        __MODULE_INFO(alias, alias_userspace, name);            \
        __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
 
+#ifdef CONFIG_NET_SOCK_MSG
 struct sk_msg;
 struct sk_psock;
 
-#ifdef CONFIG_BPF_STREAM_PARSER
+#ifdef CONFIG_BPF_SYSCALL
 struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
 void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
-#else
-static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
-{
-}
-#endif /* CONFIG_BPF_STREAM_PARSER */
+#endif /* CONFIG_BPF_SYSCALL */
 
-#ifdef CONFIG_NET_SOCK_MSG
 int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
                          int flags);
 int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
                      struct msghdr *msg, int len, int flags);
 #endif /* CONFIG_NET_SOCK_MSG */
 
+#if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG)
+static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
+{
+}
+#endif
+
 #ifdef CONFIG_CGROUP_BPF
 static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
                                      struct sk_buff *skb,
index a132a02..d4d064c 100644 (file)
@@ -515,9 +515,9 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
        return segs;
 }
 
-#ifdef CONFIG_BPF_STREAM_PARSER
+#ifdef CONFIG_BPF_SYSCALL
 struct sk_psock;
 struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
-#endif /* BPF_STREAM_PARSER */
+#endif
 
 #endif /* _UDP_H */
index cc17bc9..9c0722c 100644 (file)
@@ -80,19 +80,6 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
 void __xsk_map_flush(void);
 
-static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
-                                                    u32 key)
-{
-       struct xsk_map *m = container_of(map, struct xsk_map, map);
-       struct xdp_sock *xs;
-
-       if (key >= map->max_entries)
-               return NULL;
-
-       xs = READ_ONCE(m->xsk_map[key]);
-       return xs;
-}
-
 #else
 
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
@@ -109,12 +96,6 @@ static inline void __xsk_map_flush(void)
 {
 }
 
-static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
-                                                    u32 key)
-{
-       return NULL;
-}
-
 #endif /* CONFIG_XDP_SOCKETS */
 
 #endif /* _LINUX_XDP_SOCK_H */
index 76a9717..fcad364 100644 (file)
@@ -86,19 +86,15 @@ struct _bpf_dtab_netdev {
 };
 #endif /* __DEVMAP_OBJ_TYPE */
 
-#define devmap_ifindex(tgt, map)                               \
-       (((map->map_type == BPF_MAP_TYPE_DEVMAP ||      \
-                 map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)) ? \
-         ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex : 0)
-
 DECLARE_EVENT_CLASS(xdp_redirect_template,
 
        TP_PROTO(const struct net_device *dev,
                 const struct bpf_prog *xdp,
                 const void *tgt, int err,
-                const struct bpf_map *map, u32 index),
+                enum bpf_map_type map_type,
+                u32 map_id, u32 index),
 
-       TP_ARGS(dev, xdp, tgt, err, map, index),
+       TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index),
 
        TP_STRUCT__entry(
                __field(int, prog_id)
@@ -111,14 +107,22 @@ DECLARE_EVENT_CLASS(xdp_redirect_template,
        ),
 
        TP_fast_assign(
+               u32 ifindex = 0, map_index = index;
+
+               if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+                       ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex;
+               } else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
+                       ifindex = index;
+                       map_index = 0;
+               }
+
                __entry->prog_id        = xdp->aux->id;
                __entry->act            = XDP_REDIRECT;
                __entry->ifindex        = dev->ifindex;
                __entry->err            = err;
-               __entry->to_ifindex     = map ? devmap_ifindex(tgt, map) :
-                                               index;
-               __entry->map_id         = map ? map->id : 0;
-               __entry->map_index      = map ? index : 0;
+               __entry->to_ifindex     = ifindex;
+               __entry->map_id         = map_id;
+               __entry->map_index      = map_index;
        ),
 
        TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d"
@@ -133,45 +137,49 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect,
        TP_PROTO(const struct net_device *dev,
                 const struct bpf_prog *xdp,
                 const void *tgt, int err,
-                const struct bpf_map *map, u32 index),
-       TP_ARGS(dev, xdp, tgt, err, map, index)
+                enum bpf_map_type map_type,
+                u32 map_id, u32 index),
+       TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index)
 );
 
 DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err,
        TP_PROTO(const struct net_device *dev,
                 const struct bpf_prog *xdp,
                 const void *tgt, int err,
-                const struct bpf_map *map, u32 index),
-       TP_ARGS(dev, xdp, tgt, err, map, index)
+                enum bpf_map_type map_type,
+                u32 map_id, u32 index),
+       TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index)
 );
 
-#define _trace_xdp_redirect(dev, xdp, to)                              \
-        trace_xdp_redirect(dev, xdp, NULL, 0, NULL, to)
+#define _trace_xdp_redirect(dev, xdp, to)                                              \
+        trace_xdp_redirect(dev, xdp, NULL, 0, BPF_MAP_TYPE_UNSPEC, INT_MAX, to)
 
-#define _trace_xdp_redirect_err(dev, xdp, to, err)                     \
-        trace_xdp_redirect_err(dev, xdp, NULL, err, NULL, to)
+#define _trace_xdp_redirect_err(dev, xdp, to, err)                                     \
+        trace_xdp_redirect_err(dev, xdp, NULL, err, BPF_MAP_TYPE_UNSPEC, INT_MAX, to)
 
-#define _trace_xdp_redirect_map(dev, xdp, to, map, index)              \
-        trace_xdp_redirect(dev, xdp, to, 0, map, index)
+#define _trace_xdp_redirect_map(dev, xdp, to, map_type, map_id, index) \
+        trace_xdp_redirect(dev, xdp, to, 0, map_type, map_id, index)
 
-#define _trace_xdp_redirect_map_err(dev, xdp, to, map, index, err)     \
-        trace_xdp_redirect_err(dev, xdp, to, err, map, index)
+#define _trace_xdp_redirect_map_err(dev, xdp, to, map_type, map_id, index, err) \
+        trace_xdp_redirect_err(dev, xdp, to, err, map_type, map_id, index)
 
 /* not used anymore, but kept around so as not to break old programs */
 DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map,
        TP_PROTO(const struct net_device *dev,
                 const struct bpf_prog *xdp,
                 const void *tgt, int err,
-                const struct bpf_map *map, u32 index),
-       TP_ARGS(dev, xdp, tgt, err, map, index)
+                enum bpf_map_type map_type,
+                u32 map_id, u32 index),
+       TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index)
 );
 
 DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err,
        TP_PROTO(const struct net_device *dev,
                 const struct bpf_prog *xdp,
                 const void *tgt, int err,
-                const struct bpf_map *map, u32 index),
-       TP_ARGS(dev, xdp, tgt, err, map, index)
+                enum bpf_map_type map_type,
+                u32 map_id, u32 index),
+       TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index)
 );
 
 TRACE_EVENT(xdp_cpumap_kthread,
index 79c8933..2d3036e 100644 (file)
@@ -93,7 +93,717 @@ union bpf_iter_link_info {
        } map;
 };
 
-/* BPF syscall commands, see bpf(2) man-page for details. */
+/* BPF syscall commands, see bpf(2) man-page for more details. */
+/**
+ * DOC: eBPF Syscall Preamble
+ *
+ * The operation to be performed by the **bpf**\ () system call is determined
+ * by the *cmd* argument. Each operation takes an accompanying argument,
+ * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see
+ * below). The size argument is the size of the union pointed to by *attr*.
+ */
+/**
+ * DOC: eBPF Syscall Commands
+ *
+ * BPF_MAP_CREATE
+ *     Description
+ *             Create a map and return a file descriptor that refers to the
+ *             map. The close-on-exec file descriptor flag (see **fcntl**\ (2))
+ *             is automatically enabled for the new file descriptor.
+ *
+ *             Applying **close**\ (2) to the file descriptor returned by
+ *             **BPF_MAP_CREATE** will delete the map (but see NOTES).
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_MAP_LOOKUP_ELEM
+ *     Description
+ *             Look up an element with a given *key* in the map referred to
+ *             by the file descriptor *map_fd*.
+ *
+ *             The *flags* argument may be specified as one of the
+ *             following:
+ *
+ *             **BPF_F_LOCK**
+ *                     Look up the value of a spin-locked map without
+ *                     returning the lock. This must be specified if the
+ *                     elements contain a spinlock.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_MAP_UPDATE_ELEM
+ *     Description
+ *             Create or update an element (key/value pair) in a specified map.
+ *
+ *             The *flags* argument should be specified as one of the
+ *             following:
+ *
+ *             **BPF_ANY**
+ *                     Create a new element or update an existing element.
+ *             **BPF_NOEXIST**
+ *                     Create a new element only if it did not exist.
+ *             **BPF_EXIST**
+ *                     Update an existing element.
+ *             **BPF_F_LOCK**
+ *                     Update a spin_lock-ed map element.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ *             May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**,
+ *             **E2BIG**, **EEXIST**, or **ENOENT**.
+ *
+ *             **E2BIG**
+ *                     The number of elements in the map reached the
+ *                     *max_entries* limit specified at map creation time.
+ *             **EEXIST**
+ *                     If *flags* specifies **BPF_NOEXIST** and the element
+ *                     with *key* already exists in the map.
+ *             **ENOENT**
+ *                     If *flags* specifies **BPF_EXIST** and the element with
+ *                     *key* does not exist in the map.
+ *
+ * BPF_MAP_DELETE_ELEM
+ *     Description
+ *             Look up and delete an element by key in a specified map.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_MAP_GET_NEXT_KEY
+ *     Description
+ *             Look up an element by key in a specified map and return the key
+ *             of the next element. Can be used to iterate over all elements
+ *             in the map.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ *             The following cases can be used to iterate over all elements of
+ *             the map:
+ *
+ *             * If *key* is not found, the operation returns zero and sets
+ *               the *next_key* pointer to the key of the first element.
+ *             * If *key* is found, the operation returns zero and sets the
+ *               *next_key* pointer to the key of the next element.
+ *             * If *key* is the last element, returns -1 and *errno* is set
+ *               to **ENOENT**.
+ *
+ *             May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or
+ *             **EINVAL** on error.
+ *
+ * BPF_PROG_LOAD
+ *     Description
+ *             Verify and load an eBPF program, returning a new file
+ *             descriptor associated with the program.
+ *
+ *             Applying **close**\ (2) to the file descriptor returned by
+ *             **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES).
+ *
+ *             The close-on-exec file descriptor flag (see **fcntl**\ (2)) is
+ *             automatically enabled for the new file descriptor.
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_OBJ_PIN
+ *     Description
+ *             Pin an eBPF program or map referred by the specified *bpf_fd*
+ *             to the provided *pathname* on the filesystem.
+ *
+ *             The *pathname* argument must not contain a dot (".").
+ *
+ *             On success, *pathname* retains a reference to the eBPF object,
+ *             preventing deallocation of the object when the original
+ *             *bpf_fd* is closed. This allows the eBPF object to live beyond
+ *             **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent
+ *             process.
+ *
+ *             Applying **unlink**\ (2) or similar calls to the *pathname*
+ *             unpins the object from the filesystem, removing the reference.
+ *             If no other file descriptors or filesystem nodes refer to the
+ *             same object, it will be deallocated (see NOTES).
+ *
+ *             The filesystem type for the parent directory of *pathname* must
+ *             be **BPF_FS_MAGIC**.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_OBJ_GET
+ *     Description
+ *             Open a file descriptor for the eBPF object pinned to the
+ *             specified *pathname*.
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_PROG_ATTACH
+ *     Description
+ *             Attach an eBPF program to a *target_fd* at the specified
+ *             *attach_type* hook.
+ *
+ *             The *attach_type* specifies the eBPF attachment point to
+ *             attach the program to, and must be one of *bpf_attach_type*
+ *             (see below).
+ *
+ *             The *attach_bpf_fd* must be a valid file descriptor for a
+ *             loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap
+ *             or sock_ops type corresponding to the specified *attach_type*.
+ *
+ *             The *target_fd* must be a valid file descriptor for a kernel
+ *             object which depends on the attach type of *attach_bpf_fd*:
+ *
+ *             **BPF_PROG_TYPE_CGROUP_DEVICE**,
+ *             **BPF_PROG_TYPE_CGROUP_SKB**,
+ *             **BPF_PROG_TYPE_CGROUP_SOCK**,
+ *             **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
+ *             **BPF_PROG_TYPE_CGROUP_SOCKOPT**,
+ *             **BPF_PROG_TYPE_CGROUP_SYSCTL**,
+ *             **BPF_PROG_TYPE_SOCK_OPS**
+ *
+ *                     Control Group v2 hierarchy with the eBPF controller
+ *                     enabled. Requires the kernel to be compiled with
+ *                     **CONFIG_CGROUP_BPF**.
+ *
+ *             **BPF_PROG_TYPE_FLOW_DISSECTOR**
+ *
+ *                     Network namespace (eg /proc/self/ns/net).
+ *
+ *             **BPF_PROG_TYPE_LIRC_MODE2**
+ *
+ *                     LIRC device path (eg /dev/lircN). Requires the kernel
+ *                     to be compiled with **CONFIG_BPF_LIRC_MODE2**.
+ *
+ *             **BPF_PROG_TYPE_SK_SKB**,
+ *             **BPF_PROG_TYPE_SK_MSG**
+ *
+ *                     eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**).
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_PROG_DETACH
+ *     Description
+ *             Detach the eBPF program associated with the *target_fd* at the
+ *             hook specified by *attach_type*. The program must have been
+ *             previously attached using **BPF_PROG_ATTACH**.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_PROG_TEST_RUN
+ *     Description
+ *             Run the eBPF program associated with the *prog_fd* a *repeat*
+ *             number of times against a provided program context *ctx_in* and
+ *             data *data_in*, and return the modified program context
+ *             *ctx_out*, *data_out* (for example, packet data), result of the
+ *             execution *retval*, and *duration* of the test run.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ *             **ENOSPC**
+ *                     Either *data_size_out* or *ctx_size_out* is too small.
+ *             **ENOTSUPP**
+ *                     This command is not supported by the program type of
+ *                     the program referred to by *prog_fd*.
+ *
+ * BPF_PROG_GET_NEXT_ID
+ *     Description
+ *             Fetch the next eBPF program currently loaded into the kernel.
+ *
+ *             Looks for the eBPF program with an id greater than *start_id*
+ *             and updates *next_id* on success. If no other eBPF programs
+ *             remain with ids higher than *start_id*, returns -1 and sets
+ *             *errno* to **ENOENT**.
+ *
+ *     Return
+ *             Returns zero on success. On error, or when no id remains, -1
+ *             is returned and *errno* is set appropriately.
+ *
+ * BPF_MAP_GET_NEXT_ID
+ *     Description
+ *             Fetch the next eBPF map currently loaded into the kernel.
+ *
+ *             Looks for the eBPF map with an id greater than *start_id*
+ *             and updates *next_id* on success. If no other eBPF maps
+ *             remain with ids higher than *start_id*, returns -1 and sets
+ *             *errno* to **ENOENT**.
+ *
+ *     Return
+ *             Returns zero on success. On error, or when no id remains, -1
+ *             is returned and *errno* is set appropriately.
+ *
+ * BPF_PROG_GET_FD_BY_ID
+ *     Description
+ *             Open a file descriptor for the eBPF program corresponding to
+ *             *prog_id*.
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_MAP_GET_FD_BY_ID
+ *     Description
+ *             Open a file descriptor for the eBPF map corresponding to
+ *             *map_id*.
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_OBJ_GET_INFO_BY_FD
+ *     Description
+ *             Obtain information about the eBPF object corresponding to
+ *             *bpf_fd*.
+ *
+ *             Populates up to *info_len* bytes of *info*, which will be in
+ *             one of the following formats depending on the eBPF object type
+ *             of *bpf_fd*:
+ *
+ *             * **struct bpf_prog_info**
+ *             * **struct bpf_map_info**
+ *             * **struct bpf_btf_info**
+ *             * **struct bpf_link_info**
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_PROG_QUERY
+ *     Description
+ *             Obtain information about eBPF programs associated with the
+ *             specified *attach_type* hook.
+ *
+ *             The *target_fd* must be a valid file descriptor for a kernel
+ *             object which depends on the attach type of *attach_bpf_fd*:
+ *
+ *             **BPF_PROG_TYPE_CGROUP_DEVICE**,
+ *             **BPF_PROG_TYPE_CGROUP_SKB**,
+ *             **BPF_PROG_TYPE_CGROUP_SOCK**,
+ *             **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
+ *             **BPF_PROG_TYPE_CGROUP_SOCKOPT**,
+ *             **BPF_PROG_TYPE_CGROUP_SYSCTL**,
+ *             **BPF_PROG_TYPE_SOCK_OPS**
+ *
+ *                     Control Group v2 hierarchy with the eBPF controller
+ *                     enabled. Requires the kernel to be compiled with
+ *                     **CONFIG_CGROUP_BPF**.
+ *
+ *             **BPF_PROG_TYPE_FLOW_DISSECTOR**
+ *
+ *                     Network namespace (eg /proc/self/ns/net).
+ *
+ *             **BPF_PROG_TYPE_LIRC_MODE2**
+ *
+ *                     LIRC device path (eg /dev/lircN). Requires the kernel
+ *                     to be compiled with **CONFIG_BPF_LIRC_MODE2**.
+ *
+ *             **BPF_PROG_QUERY** always fetches the number of programs
+ *             attached and the *attach_flags* which were used to attach those
+ *             programs. Additionally, if *prog_ids* is nonzero and the number
+ *             of attached programs is less than *prog_cnt*, populates
+ *             *prog_ids* with the eBPF program ids of the programs attached
+ *             at *target_fd*.
+ *
+ *             The following flags may alter the result:
+ *
+ *             **BPF_F_QUERY_EFFECTIVE**
+ *                     Only return information regarding programs which are
+ *                     currently effective at the specified *target_fd*.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_RAW_TRACEPOINT_OPEN
+ *     Description
+ *             Attach an eBPF program to a tracepoint *name* to access kernel
+ *             internal arguments of the tracepoint in their raw form.
+ *
+ *             The *prog_fd* must be a valid file descriptor associated with
+ *             a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**.
+ *
+ *             No ABI guarantees are made about the content of tracepoint
+ *             arguments exposed to the corresponding eBPF program.
+ *
+ *             Applying **close**\ (2) to the file descriptor returned by
+ *             **BPF_RAW_TRACEPOINT_OPEN** will detach the eBPF program from
+ *             the tracepoint (but see NOTES).
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_BTF_LOAD
+ *     Description
+ *             Verify and load BPF Type Format (BTF) metadata into the kernel,
+ *             returning a new file descriptor associated with the metadata.
+ *             BTF is described in more detail at
+ *             https://www.kernel.org/doc/html/latest/bpf/btf.html.
+ *
+ *             The *btf* parameter must point to valid memory providing
+ *             *btf_size* bytes of BTF binary metadata.
+ *
+ *             The returned file descriptor can be passed to other **bpf**\ ()
+ *             subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to
+ *             associate the BTF with those objects.
+ *
+ *             Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional
+ *             parameters to specify a *btf_log_buf*, *btf_log_size* and
+ *             *btf_log_level* which allow the kernel to return freeform log
+ *             output regarding the BTF verification process.
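+ *
+ *             A minimal sketch (assuming *btf_data* points to *btf_len*
+ *             bytes of valid BTF binary metadata)::
+ *
+ *                     char log[4096];
+ *                     union bpf_attr attr = {};
+ *
+ *                     attr.btf = (__u64)(unsigned long)btf_data;
+ *                     attr.btf_size = btf_len;
+ *                     attr.btf_log_buf = (__u64)(unsigned long)log;
+ *                     attr.btf_log_size = sizeof(log);
+ *                     attr.btf_log_level = 1;
+ *                     int btf_fd = syscall(__NR_bpf, BPF_BTF_LOAD,
+ *                                          &attr, sizeof(attr));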
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_BTF_GET_FD_BY_ID
+ *     Description
+ *             Open a file descriptor for the BPF Type Format (BTF)
+ *             corresponding to *btf_id*.
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_TASK_FD_QUERY
+ *     Description
+ *             Obtain information about eBPF programs associated with the
+ *             target process identified by *pid* and *fd*.
+ *
+ *             If the *pid* and *fd* are associated with a tracepoint, kprobe
+ *             or uprobe perf event, then the *prog_id* and *fd_type* will
+ *             be populated with the eBPF program id and a file descriptor
+ *             type of **bpf_task_fd_type**. If associated with a kprobe or
+ *             uprobe, the *probe_offset* and *probe_addr* will also be
+ *             populated. Optionally, if *buf* is provided, then up to
+ *             *buf_len* bytes of *buf* will be populated with the name of
+ *             the tracepoint, kprobe or uprobe.
+ *
+ *             The resulting *prog_id* may be inspected in more detail
+ *             using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_MAP_LOOKUP_AND_DELETE_ELEM
+ *     Description
+ *             Look up an element with the given *key* in the map referred to
+ *             by the file descriptor *fd*, and if found, delete the element.
+ *
+ *             The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types
+ *             implement this command as a "pop" operation, deleting the top
+ *             element rather than one corresponding to *key*.
+ *             The *key* and *key_len* parameters should be zeroed when
+ *             issuing this operation for these map types.
+ *
+ *             This command is only valid for the following map types:
+ *
+ *             * **BPF_MAP_TYPE_QUEUE**
+ *             * **BPF_MAP_TYPE_STACK**
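+ *
+ *             A minimal "pop" sketch for a **BPF_MAP_TYPE_QUEUE** map of
+ *             **__u64** values (hypothetical *queue_fd*)::
+ *
+ *                     __u64 val;
+ *                     union bpf_attr attr = {};
+ *
+ *                     attr.map_fd = queue_fd;
+ *                     attr.value = (__u64)(unsigned long)&val;
+ *                     int err = syscall(__NR_bpf,
+ *                                       BPF_MAP_LOOKUP_AND_DELETE_ELEM,
+ *                                       &attr, sizeof(attr));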
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_MAP_FREEZE
+ *     Description
+ *             Freeze the permissions of the specified map.
+ *
+ *             Write permissions may be frozen by passing zero *flags*.
+ *             Upon success, no future syscall invocations may alter the
+ *             map state of *map_fd*. Write operations from eBPF programs
+ *             are still possible for a frozen map.
+ *
+ *             Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_BTF_GET_NEXT_ID
+ *     Description
+ *             Fetch the next BPF Type Format (BTF) object currently loaded
+ *             into the kernel.
+ *
+ *             Looks for the BTF object with an id greater than *start_id*
+ *             and updates *next_id* on success. If no other BTF objects
+ *             remain with ids higher than *start_id*, returns -1 and sets
+ *             *errno* to **ENOENT**.
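+ *
+ *             A minimal sketch walking all loaded BTF objects; the loop
+ *             ends once *errno* is **ENOENT**::
+ *
+ *                     union bpf_attr attr = {};
+ *
+ *                     while (!syscall(__NR_bpf, BPF_BTF_GET_NEXT_ID,
+ *                                     &attr, sizeof(attr)))
+ *                             attr.start_id = attr.next_id;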
+ *
+ *     Return
+ *             Returns zero on success. On error, or when no id remains, -1
+ *             is returned and *errno* is set appropriately.
+ *
+ * BPF_MAP_LOOKUP_BATCH
+ *     Description
+ *             Iterate and fetch multiple elements in a map.
+ *
+ *             Two opaque values are used to manage batch operations,
+ *             *in_batch* and *out_batch*. Initially, *in_batch* must be set
+ *             to NULL to begin the batched operation. After each subsequent
+ *             **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant
+ *             *out_batch* as the *in_batch* for the next operation to
+ *             continue iteration from the current point.
+ *
+ *             The *keys* and *values* are output parameters which must point
+ *             to memory large enough to hold *count* items based on the key
+ *             and value size of the map *map_fd*. The *keys* buffer must be
+ *             at least *key_size* * *count* bytes, and the *values* buffer
+ *             at least *value_size* * *count* bytes.
+ *
+ *             The *elem_flags* argument may be specified as one of the
+ *             following:
+ *
+ *             **BPF_F_LOCK**
+ *                     Look up the value of a spin-locked map without
+ *                     returning the lock. This must be specified if the
+ *                     elements contain a spinlock.
+ *
+ *             On success, *count* elements from the map are copied into the
+ *             user buffer, with the keys copied into *keys* and the values
+ *             copied into the corresponding indices in *values*.
+ *
+ *             If an error is returned and *errno* is not **EFAULT**, *count*
+ *             is set to the number of successfully processed elements.
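+ *
+ *             A minimal sketch of one batched-lookup pass over a map of
+ *             **__u32** keys and values (hypothetical *map_fd*); a
+ *             following pass would copy *out_batch* into *in_batch*::
+ *
+ *                     __u32 keys[64], vals[64], batch;
+ *                     union bpf_attr attr = {};
+ *
+ *                     attr.batch.map_fd = map_fd;
+ *                     attr.batch.out_batch = (__u64)(unsigned long)&batch;
+ *                     attr.batch.keys = (__u64)(unsigned long)keys;
+ *                     attr.batch.values = (__u64)(unsigned long)vals;
+ *                     attr.batch.count = 64;
+ *                     int err = syscall(__NR_bpf, BPF_MAP_LOOKUP_BATCH,
+ *                                       &attr, sizeof(attr));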
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ *             May set *errno* to **ENOSPC** to indicate that *keys* or
+ *             *values* is too small to dump an entire bucket during
+ *             iteration of a hash-based map type.
+ *
+ * BPF_MAP_LOOKUP_AND_DELETE_BATCH
+ *     Description
+ *             Iterate and delete multiple elements in a map.
+ *
+ *             This operation has the same behavior as
+ *             **BPF_MAP_LOOKUP_BATCH** with two exceptions:
+ *
+ *             * Every element that is successfully returned is also deleted
+ *               from the map. This is at most *count* elements. Note that
+ *               *count* is both an input and an output parameter.
+ *             * Upon returning with *errno* set to **EFAULT**, up to
+ *               *count* elements may be deleted without returning the keys
+ *               and values of the deleted elements.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_MAP_UPDATE_BATCH
+ *     Description
+ *             Update multiple elements in a map by *key*.
+ *
+ *             The *keys* and *values* are input parameters which must point
+ *             to memory large enough to hold *count* items based on the key
+ *             and value size of the map *map_fd*. The *keys* buffer must be
+ *             at least *key_size* * *count* bytes, and the *values* buffer
+ *             at least *value_size* * *count* bytes.
+ *
+ *             Each element specified in *keys* is sequentially updated to the
+ *             value in the corresponding index in *values*. The *in_batch*
+ *             and *out_batch* parameters are ignored and should be zeroed.
+ *
+ *             The *elem_flags* argument should be specified as one of the
+ *             following:
+ *
+ *             **BPF_ANY**
+ *                     Create new elements or update existing elements.
+ *             **BPF_NOEXIST**
+ *                     Create new elements only if they do not exist.
+ *             **BPF_EXIST**
+ *                     Update existing elements.
+ *             **BPF_F_LOCK**
+ *                     Update spin_lock-ed map elements. This must be
+ *                     specified if the map value contains a spinlock.
+ *
+ *             On success, *count* elements from the map are updated.
+ *
+ *             If an error is returned and *errno* is not **EFAULT**, *count*
+ *             is set to the number of successfully processed elements.
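+ *
+ *             A minimal sketch updating *n* elements of a map of **__u32**
+ *             keys and values (hypothetical *map_fd*, *keys* and *vals*)::
+ *
+ *                     union bpf_attr attr = {};
+ *
+ *                     attr.batch.map_fd = map_fd;
+ *                     attr.batch.keys = (__u64)(unsigned long)keys;
+ *                     attr.batch.values = (__u64)(unsigned long)vals;
+ *                     attr.batch.count = n;
+ *                     attr.batch.elem_flags = BPF_ANY;
+ *                     int err = syscall(__NR_bpf, BPF_MAP_UPDATE_BATCH,
+ *                                       &attr, sizeof(attr));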
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ *             May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or
+ *             **E2BIG**. **E2BIG** indicates that the number of elements in
+ *             the map reached the *max_entries* limit specified at map
+ *             creation time.
+ *
+ *             May set *errno* to one of the following error codes under
+ *             specific circumstances:
+ *
+ *             **EEXIST**
+ *                     If *elem_flags* specifies **BPF_NOEXIST** and the
+ *                     element with *key* already exists in the map.
+ *             **ENOENT**
+ *                     If *elem_flags* specifies **BPF_EXIST** and the
+ *                     element with *key* does not exist in the map.
+ *
+ * BPF_MAP_DELETE_BATCH
+ *     Description
+ *             Delete multiple elements in a map by *key*.
+ *
+ *             The *keys* parameter is an input parameter which must point
+ *             to memory large enough to hold *count* items based on the key
+ *             size of the map *map_fd*, that is, *key_size* * *count*.
+ *
+ *             Each element specified in *keys* is sequentially deleted. The
+ *             *in_batch*, *out_batch*, and *values* parameters are ignored
+ *             and should be zeroed.
+ *
+ *             The *elem_flags* argument may be specified as one of the
+ *             following:
+ *
+ *             **BPF_F_LOCK**
+ *                     Look up the value of a spin-locked map without
+ *                     returning the lock. This must be specified if the
+ *                     elements contain a spinlock.
+ *
+ *             On success, *count* elements from the map are deleted.
+ *
+ *             If an error is returned and *errno* is not **EFAULT**, *count*
+ *             is set to the number of successfully processed elements. If
+ *             *errno* is **EFAULT**, up to *count* elements may have been
+ *             deleted.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_LINK_CREATE
+ *     Description
+ *             Attach an eBPF program to a *target_fd* at the specified
+ *             *attach_type* hook and return a file descriptor handle for
+ *             managing the link.
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_LINK_UPDATE
+ *     Description
+ *             Update the eBPF program in the specified *link_fd* to
+ *             *new_prog_fd*.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_LINK_GET_FD_BY_ID
+ *     Description
+ *             Open a file descriptor for the eBPF link corresponding to
+ *             *link_id*.
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_LINK_GET_NEXT_ID
+ *     Description
+ *             Fetch the next eBPF link currently loaded into the kernel.
+ *
+ *             Looks for the eBPF link with an id greater than *start_id*
+ *             and updates *next_id* on success. If no other eBPF links
+ *             remain with ids higher than *start_id*, returns -1 and sets
+ *             *errno* to **ENOENT**.
+ *
+ *     Return
+ *             Returns zero on success. On error, or when no id remains, -1
+ *             is returned and *errno* is set appropriately.
+ *
+ * BPF_ENABLE_STATS
+ *     Description
+ *             Enable eBPF runtime statistics gathering.
+ *
+ *             Runtime statistics gathering for the eBPF runtime is disabled
+ *             by default to minimize the corresponding performance overhead.
+ *             This command enables statistics globally.
+ *
+ *             Multiple user-space processes may independently enable
+ *             statistics.
+ *             After gathering the desired statistics, eBPF runtime statistics
+ *             may be disabled again by calling **close**\ (2) for the file
+ *             descriptor returned by this function. Statistics will only be
+ *             disabled system-wide when all outstanding file descriptors
+ *             returned by prior calls for this subcommand are closed.
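+ *
+ *             A minimal sketch enabling run-time statistics until the
+ *             returned descriptor is closed (**BPF_STATS_RUN_TIME** is
+ *             the only statistics type defined so far)::
+ *
+ *                     union bpf_attr attr = {};
+ *
+ *                     attr.enable_stats.type = BPF_STATS_RUN_TIME;
+ *                     int stats_fd = syscall(__NR_bpf, BPF_ENABLE_STATS,
+ *                                            &attr, sizeof(attr));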
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_ITER_CREATE
+ *     Description
+ *             Create an iterator on top of the specified *link_fd* (as
+ *             previously created using **BPF_LINK_CREATE**) and return a
+ *             file descriptor that can be used to trigger the iteration.
+ *
+ *             If the resulting file descriptor is pinned to the filesystem
+ *             using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls
+ *             for that path will trigger the iterator to read kernel state
+ *             using the eBPF program attached to *link_fd*.
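+ *
+ *             A minimal sketch draining an iterator to stdout (assuming
+ *             *link_fd* refers to an attached iterator program)::
+ *
+ *                     char buf[4096];
+ *                     ssize_t n;
+ *                     union bpf_attr attr = {};
+ *
+ *                     attr.iter_create.link_fd = link_fd;
+ *                     int iter_fd = syscall(__NR_bpf, BPF_ITER_CREATE,
+ *                                           &attr, sizeof(attr));
+ *                     while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
+ *                             fwrite(buf, 1, n, stdout);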
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_LINK_DETACH
+ *     Description
+ *             Forcefully detach the specified *link_fd* from its
+ *             corresponding attachment point.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_PROG_BIND_MAP
+ *     Description
+ *             Bind a map to the lifetime of an eBPF program.
+ *
+ *             The map identified by *map_fd* is bound to the program
+ *             identified by *prog_fd* and only released when *prog_fd* is
+ *             released. This may be used in cases where metadata should be
+ *             associated with a program which otherwise does not contain any
+ *             references to the map (for example, embedded in the eBPF
+ *             program instructions).
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * NOTES
+ *     eBPF objects (maps and programs) can be shared between processes.
+ *
+ *     * After **fork**\ (2), the child inherits file descriptors
+ *       referring to the same eBPF objects.
+ *     * File descriptors referring to eBPF objects can be transferred over
+ *       **unix**\ (7) domain sockets.
+ *     * File descriptors referring to eBPF objects can be duplicated in the
+ *       usual way, using **dup**\ (2) and similar calls.
+ *     * File descriptors referring to eBPF objects can be pinned to the
+ *       filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2).
+ *
+ *     An eBPF object is deallocated only after all file descriptors referring
+ *     to the object have been closed and no references remain pinned to the
+ *     filesystem or attached (for example, bound to a program or device).
+ */
 enum bpf_cmd {
        BPF_MAP_CREATE,
        BPF_MAP_LOOKUP_ELEM,
@@ -393,6 +1103,15 @@ enum bpf_link_type {
  *                   is struct/union.
  */
 #define BPF_PSEUDO_BTF_ID      3
+/* insn[0].src_reg:  BPF_PSEUDO_FUNC
+ * insn[0].imm:      insn offset to the func
+ * insn[1].imm:      0
+ * insn[0].off:      0
+ * insn[1].off:      0
+ * ldimm64 rewrite:  address of the function
+ * verifier type:    PTR_TO_FUNC.
+ */
+#define BPF_PSEUDO_FUNC                4
 
 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
  * offset to another bpf function
@@ -720,7 +1439,7 @@ union bpf_attr {
  * parsed and used to produce a manual page. The workflow is the following,
  * and requires the rst2man utility:
  *
- *     $ ./scripts/bpf_helpers_doc.py \
+ *     $ ./scripts/bpf_doc.py \
  *             --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
  *     $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
  *     $ man /tmp/bpf-helpers.7
@@ -1765,6 +2484,10 @@ union bpf_attr {
  *               Use with ENCAP_L3/L4 flags to further specify the tunnel
  *               type; *len* is the length of the inner MAC header.
  *
+ *             * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**:
+ *               Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
+ *               L2 type as Ethernet.
+ *
  *             A call to this helper is susceptible to change the underlying
  *             packet buffer. Therefore, at load time, all checks on pointers
  *             previously done by the verifier are invalidated and must be
@@ -3909,6 +4632,34 @@ union bpf_attr {
  *             * **BPF_MTU_CHK_RET_FRAG_NEEDED**
  *             * **BPF_MTU_CHK_RET_SEGS_TOOBIG**
  *
+ * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags)
+ *     Description
+ *             For each element in **map**, call **callback_fn** function with
+ *             **map**, **callback_ctx** and other map-specific parameters.
+ *             The **callback_fn** should be a static function and
+ *             the **callback_ctx** should be a pointer to the stack.
+ *             The **flags** argument controls certain aspects of the
+ *             helper and must currently be 0.
+ *
+ *             The following is a list of supported map types and their
+ *             respective expected callback signatures:
+ *
+ *             BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH,
+ *             BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH,
+ *             BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY
+ *
+ *             long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx);
+ *
+ *             For per-CPU maps, the map value is the value on the CPU where
+ *             the BPF program is running.
+ *
+ *             If **callback_fn** returns 0, the helper will continue to the
+ *             next element. If the return value is 1, the helper will skip
+ *             the rest of the elements and return. Other return values are
+ *             currently reserved.
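+ *
+ *             A minimal BPF-side sketch counting the elements of a
+ *             hypothetical **BPF_MAP_TYPE_ARRAY** map *my_map* that holds
+ *             **__u64** values::
+ *
+ *                     static long count_elem(struct bpf_map *map,
+ *                                            __u32 *key, __u64 *val,
+ *                                            void *ctx)
+ *                     {
+ *                             (*(__u64 *)ctx)++;
+ *                             return 0;
+ *                     }
+ *
+ *                     __u64 n = 0;
+ *                     bpf_for_each_map_elem(&my_map, count_elem, &n, 0);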
+ *
+ *     Return
+ *             The number of traversed map elements for success, **-EINVAL** for
+ *             invalid **flags**.
  */
 #define __BPF_FUNC_MAPPER(FN)          \
        FN(unspec),                     \
@@ -4075,6 +4826,7 @@ union bpf_attr {
        FN(ima_inode_hash),             \
        FN(sock_from_file),             \
        FN(check_mtu),                  \
+       FN(for_each_map_elem),          \
        /* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -4168,6 +4920,7 @@ enum {
        BPF_F_ADJ_ROOM_ENCAP_L4_GRE     = (1ULL << 3),
        BPF_F_ADJ_ROOM_ENCAP_L4_UDP     = (1ULL << 4),
        BPF_F_ADJ_ROOM_NO_CSUM_RESET    = (1ULL << 5),
+       BPF_F_ADJ_ROOM_ENCAP_L2_ETH     = (1ULL << 6),
 };
 
 enum {
@@ -5205,7 +5958,10 @@ struct bpf_pidns_info {
 
 /* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
 struct bpf_sk_lookup {
-       __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+       union {
+               __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+               __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */
+       };
 
        __u32 family;           /* Protocol family (AF_INET, AF_INET6) */
        __u32 protocol;         /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
index 5a66710..d27b170 100644 (file)
@@ -52,7 +52,7 @@ struct btf_type {
        };
 };
 
-#define BTF_INFO_KIND(info)    (((info) >> 24) & 0x0f)
+#define BTF_INFO_KIND(info)    (((info) >> 24) & 0x1f)
 #define BTF_INFO_VLEN(info)    ((info) & 0xffff)
 #define BTF_INFO_KFLAG(info)   ((info) >> 31)
 
@@ -72,7 +72,8 @@ struct btf_type {
 #define BTF_KIND_FUNC_PROTO    13      /* Function Proto       */
 #define BTF_KIND_VAR           14      /* Variable     */
 #define BTF_KIND_DATASEC       15      /* Section      */
-#define BTF_KIND_MAX           BTF_KIND_DATASEC
+#define BTF_KIND_FLOAT         16      /* Floating point       */
+#define BTF_KIND_MAX           BTF_KIND_FLOAT
 #define NR_BTF_KINDS           (BTF_KIND_MAX + 1)
 
 /* For some specific BTF_KIND, "struct btf_type" is immediately
index 22946fe..2c9cbd8 100644 (file)
@@ -1709,6 +1709,7 @@ config BPF_SYSCALL
        select BPF
        select IRQ_WORK
        select TASKS_TRACE_RCU
+       select NET_SOCK_MSG if INET
        default n
        help
          Enable the bpf() system call that allows to manipulate eBPF
index d124934..7f33098 100644 (file)
@@ -9,8 +9,8 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
 obj-${CONFIG_BPF_LSM}    += bpf_inode_storage.o
-obj-${CONFIG_BPF_LSM}    += bpf_task_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 obj-$(CONFIG_BPF_JIT) += trampoline.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o
@@ -18,7 +18,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
-obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += offload.o
 obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
 endif
index 1f84533..463d25e 100644 (file)
@@ -625,6 +625,42 @@ static const struct bpf_iter_seq_info iter_seq_info = {
        .seq_priv_size          = sizeof(struct bpf_iter_seq_array_map_info),
 };
 
+static int bpf_for_each_array_elem(struct bpf_map *map, void *callback_fn,
+                                  void *callback_ctx, u64 flags)
+{
+       u32 i, key, num_elems = 0;
+       struct bpf_array *array;
+       bool is_percpu;
+       u64 ret = 0;
+       void *val;
+
+       if (flags != 0)
+               return -EINVAL;
+
+       is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+       array = container_of(map, struct bpf_array, map);
+       if (is_percpu)
+               migrate_disable();
+       for (i = 0; i < map->max_entries; i++) {
+               if (is_percpu)
+                       val = this_cpu_ptr(array->pptrs[i]);
+               else
+                       val = array->value + array->elem_size * i;
+               num_elems++;
+               key = i;
+               ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
+                                       (u64)(long)&key, (u64)(long)val,
+                                       (u64)(long)callback_ctx, 0);
+               /* return value: 0 - continue, 1 - stop and return */
+               if (ret)
+                       break;
+       }
+
+       if (is_percpu)
+               migrate_enable();
+       return num_elems;
+}
+
 static int array_map_btf_id;
 const struct bpf_map_ops array_map_ops = {
        .map_meta_equal = array_map_meta_equal,
@@ -643,6 +679,8 @@ const struct bpf_map_ops array_map_ops = {
        .map_check_btf = array_map_check_btf,
        .map_lookup_batch = generic_map_lookup_batch,
        .map_update_batch = generic_map_update_batch,
+       .map_set_for_each_callback_args = map_set_for_each_callback_args,
+       .map_for_each_callback = bpf_for_each_array_elem,
        .map_btf_name = "bpf_array",
        .map_btf_id = &array_map_btf_id,
        .iter_seq_info = &iter_seq_info,
@@ -660,6 +698,8 @@ const struct bpf_map_ops percpu_array_map_ops = {
        .map_delete_elem = array_map_delete_elem,
        .map_seq_show_elem = percpu_array_map_seq_show_elem,
        .map_check_btf = array_map_check_btf,
+       .map_set_for_each_callback_args = map_set_for_each_callback_args,
+       .map_for_each_callback = bpf_for_each_array_elem,
        .map_btf_name = "bpf_array",
        .map_btf_id = &percpu_array_map_btf_id,
        .iter_seq_info = &iter_seq_info,
index 6639640..da75372 100644 (file)
@@ -237,7 +237,7 @@ static void inode_storage_map_free(struct bpf_map *map)
 
        smap = (struct bpf_local_storage_map *)map;
        bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
-       bpf_local_storage_map_free(smap);
+       bpf_local_storage_map_free(smap, NULL);
 }
 
 static int inode_storage_map_btf_id;
index a0d9ead..931870f 100644 (file)
@@ -675,3 +675,19 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
         */
        return ret == 0 ? 0 : -EAGAIN;
 }
+
+BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn,
+          void *, callback_ctx, u64, flags)
+{
+       return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags);
+}
+
+const struct bpf_func_proto bpf_for_each_map_elem_proto = {
+       .func           = bpf_for_each_map_elem,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_CONST_MAP_PTR,
+       .arg2_type      = ARG_PTR_TO_FUNC,
+       .arg3_type      = ARG_PTR_TO_STACK_OR_NULL,
+       .arg4_type      = ARG_ANYTHING,
+};
index dd5aede..b305270 100644 (file)
@@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
 {
        struct bpf_local_storage *local_storage;
        bool free_local_storage = false;
+       unsigned long flags;
 
        if (unlikely(!selem_linked_to_storage(selem)))
                /* selem has already been unlinked from sk */
                return;
 
        local_storage = rcu_dereference(selem->local_storage);
-       raw_spin_lock_bh(&local_storage->lock);
+       raw_spin_lock_irqsave(&local_storage->lock, flags);
        if (likely(selem_linked_to_storage(selem)))
                free_local_storage = bpf_selem_unlink_storage_nolock(
                        local_storage, selem, true);
-       raw_spin_unlock_bh(&local_storage->lock);
+       raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
        if (free_local_storage)
                kfree_rcu(local_storage, rcu);
@@ -167,6 +168,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
 {
        struct bpf_local_storage_map *smap;
        struct bpf_local_storage_map_bucket *b;
+       unsigned long flags;
 
        if (unlikely(!selem_linked_to_map(selem)))
                /* selem has already be unlinked from smap */
@@ -174,21 +176,22 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
 
        smap = rcu_dereference(SDATA(selem)->smap);
        b = select_bucket(smap, selem);
-       raw_spin_lock_bh(&b->lock);
+       raw_spin_lock_irqsave(&b->lock, flags);
        if (likely(selem_linked_to_map(selem)))
                hlist_del_init_rcu(&selem->map_node);
-       raw_spin_unlock_bh(&b->lock);
+       raw_spin_unlock_irqrestore(&b->lock, flags);
 }
 
 void bpf_selem_link_map(struct bpf_local_storage_map *smap,
                        struct bpf_local_storage_elem *selem)
 {
        struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);
+       unsigned long flags;
 
-       raw_spin_lock_bh(&b->lock);
+       raw_spin_lock_irqsave(&b->lock, flags);
        RCU_INIT_POINTER(SDATA(selem)->smap, smap);
        hlist_add_head_rcu(&selem->map_node, &b->list);
-       raw_spin_unlock_bh(&b->lock);
+       raw_spin_unlock_irqrestore(&b->lock, flags);
 }
 
 void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
@@ -224,16 +227,18 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
 
        sdata = SDATA(selem);
        if (cacheit_lockit) {
+               unsigned long flags;
+
                /* spinlock is needed to avoid racing with the
                 * parallel delete.  Otherwise, publishing an already
                 * deleted sdata to the cache will become a use-after-free
                 * problem in the next bpf_local_storage_lookup().
                 */
-               raw_spin_lock_bh(&local_storage->lock);
+               raw_spin_lock_irqsave(&local_storage->lock, flags);
                if (selem_linked_to_storage(selem))
                        rcu_assign_pointer(local_storage->cache[smap->cache_idx],
                                           sdata);
-               raw_spin_unlock_bh(&local_storage->lock);
+               raw_spin_unlock_irqrestore(&local_storage->lock, flags);
        }
 
        return sdata;
@@ -327,6 +332,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
        struct bpf_local_storage_data *old_sdata = NULL;
        struct bpf_local_storage_elem *selem;
        struct bpf_local_storage *local_storage;
+       unsigned long flags;
        int err;
 
        /* BPF_EXIST and BPF_NOEXIST cannot be both set */
@@ -374,7 +380,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
                }
        }
 
-       raw_spin_lock_bh(&local_storage->lock);
+       raw_spin_lock_irqsave(&local_storage->lock, flags);
 
        /* Recheck local_storage->list under local_storage->lock */
        if (unlikely(hlist_empty(&local_storage->list))) {
@@ -428,11 +434,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
        }
 
 unlock:
-       raw_spin_unlock_bh(&local_storage->lock);
+       raw_spin_unlock_irqrestore(&local_storage->lock, flags);
        return SDATA(selem);
 
 unlock_err:
-       raw_spin_unlock_bh(&local_storage->lock);
+       raw_spin_unlock_irqrestore(&local_storage->lock, flags);
        return ERR_PTR(err);
 }
 
@@ -468,7 +474,8 @@ void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
        spin_unlock(&cache->idx_lock);
 }
 
-void bpf_local_storage_map_free(struct bpf_local_storage_map *smap)
+void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
+                               int __percpu *busy_counter)
 {
        struct bpf_local_storage_elem *selem;
        struct bpf_local_storage_map_bucket *b;
@@ -497,7 +504,15 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap)
                while ((selem = hlist_entry_safe(
                                rcu_dereference_raw(hlist_first_rcu(&b->list)),
                                struct bpf_local_storage_elem, map_node))) {
+                       if (busy_counter) {
+                               migrate_disable();
+                               __this_cpu_inc(*busy_counter);
+                       }
                        bpf_selem_unlink(selem);
+                       if (busy_counter) {
+                               __this_cpu_dec(*busy_counter);
+                               migrate_enable();
+                       }
                        cond_resched_rcu();
                }
                rcu_read_unlock();
index 1622a44..9829f38 100644 (file)
@@ -115,10 +115,6 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &bpf_spin_lock_proto;
        case BPF_FUNC_spin_unlock:
                return &bpf_spin_unlock_proto;
-       case BPF_FUNC_task_storage_get:
-               return &bpf_task_storage_get_proto;
-       case BPF_FUNC_task_storage_delete:
-               return &bpf_task_storage_delete_proto;
        case BPF_FUNC_bprm_opts_set:
                return &bpf_bprm_opts_set_proto;
        case BPF_FUNC_ima_inode_hash:
index e0da025..fd3c74e 100644 (file)
 #include <linux/bpf_local_storage.h>
 #include <linux/filter.h>
 #include <uapi/linux/btf.h>
-#include <linux/bpf_lsm.h>
 #include <linux/btf_ids.h>
 #include <linux/fdtable.h>
 
 DEFINE_BPF_STORAGE_CACHE(task_cache);
 
+DEFINE_PER_CPU(int, bpf_task_storage_busy);
+
+static void bpf_task_storage_lock(void)
+{
+       migrate_disable();
+       __this_cpu_inc(bpf_task_storage_busy);
+}
+
+static void bpf_task_storage_unlock(void)
+{
+       __this_cpu_dec(bpf_task_storage_busy);
+       migrate_enable();
+}
+
+static bool bpf_task_storage_trylock(void)
+{
+       migrate_disable();
+       if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
+               __this_cpu_dec(bpf_task_storage_busy);
+               migrate_enable();
+               return false;
+       }
+       return true;
+}
+
 static struct bpf_local_storage __rcu **task_storage_ptr(void *owner)
 {
        struct task_struct *task = owner;
-       struct bpf_storage_blob *bsb;
 
-       bsb = bpf_task(task);
-       if (!bsb)
-               return NULL;
-       return &bsb->storage;
+       return &task->bpf_storage;
 }
 
 static struct bpf_local_storage_data *
@@ -38,13 +58,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
 {
        struct bpf_local_storage *task_storage;
        struct bpf_local_storage_map *smap;
-       struct bpf_storage_blob *bsb;
-
-       bsb = bpf_task(task);
-       if (!bsb)
-               return NULL;
 
-       task_storage = rcu_dereference(bsb->storage);
+       task_storage = rcu_dereference(task->bpf_storage);
        if (!task_storage)
                return NULL;
 
@@ -57,16 +72,12 @@ void bpf_task_storage_free(struct task_struct *task)
        struct bpf_local_storage_elem *selem;
        struct bpf_local_storage *local_storage;
        bool free_task_storage = false;
-       struct bpf_storage_blob *bsb;
        struct hlist_node *n;
-
-       bsb = bpf_task(task);
-       if (!bsb)
-               return;
+       unsigned long flags;
 
        rcu_read_lock();
 
-       local_storage = rcu_dereference(bsb->storage);
+       local_storage = rcu_dereference(task->bpf_storage);
        if (!local_storage) {
                rcu_read_unlock();
                return;
@@ -81,7 +92,8 @@ void bpf_task_storage_free(struct task_struct *task)
         * when unlinking elem from the local_storage->list and
         * the map's bucket->list.
         */
-       raw_spin_lock_bh(&local_storage->lock);
+       bpf_task_storage_lock();
+       raw_spin_lock_irqsave(&local_storage->lock, flags);
        hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
                /* Always unlink from map before unlinking from
                 * local_storage.
@@ -90,7 +102,8 @@ void bpf_task_storage_free(struct task_struct *task)
                free_task_storage = bpf_selem_unlink_storage_nolock(
                        local_storage, selem, false);
        }
-       raw_spin_unlock_bh(&local_storage->lock);
+       raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+       bpf_task_storage_unlock();
        rcu_read_unlock();
 
        /* free_task_storage should always be true as long as
@@ -123,7 +136,9 @@ static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
                goto out;
        }
 
+       bpf_task_storage_lock();
        sdata = task_storage_lookup(task, map, true);
+       bpf_task_storage_unlock();
        put_pid(pid);
        return sdata ? sdata->data : NULL;
 out:
@@ -150,13 +165,15 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
         */
        WARN_ON_ONCE(!rcu_read_lock_held());
        task = pid_task(pid, PIDTYPE_PID);
-       if (!task || !task_storage_ptr(task)) {
+       if (!task) {
                err = -ENOENT;
                goto out;
        }
 
+       bpf_task_storage_lock();
        sdata = bpf_local_storage_update(
                task, (struct bpf_local_storage_map *)map, value, map_flags);
+       bpf_task_storage_unlock();
 
        err = PTR_ERR_OR_ZERO(sdata);
 out:
@@ -199,7 +216,9 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
                goto out;
        }
 
+       bpf_task_storage_lock();
        err = task_storage_delete(task, map);
+       bpf_task_storage_unlock();
 out:
        put_pid(pid);
        return err;
@@ -213,44 +232,47 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
        if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
                return (unsigned long)NULL;
 
-       /* explicitly check that the task_storage_ptr is not
-        * NULL as task_storage_lookup returns NULL in this case and
-        * bpf_local_storage_update expects the owner to have a
-        * valid storage pointer.
-        */
-       if (!task || !task_storage_ptr(task))
+       if (!task)
+               return (unsigned long)NULL;
+
+       if (!bpf_task_storage_trylock())
                return (unsigned long)NULL;
 
        sdata = task_storage_lookup(task, map, true);
        if (sdata)
-               return (unsigned long)sdata->data;
+               goto unlock;
 
-       /* This helper must only be called from places where the lifetime of the task
-        * is guaranteed. Either by being refcounted or by being protected
-        * by an RCU read-side critical section.
-        */
-       if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
+       /* only allocate new storage, when the task is refcounted */
+       if (refcount_read(&task->usage) &&
+           (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
                sdata = bpf_local_storage_update(
                        task, (struct bpf_local_storage_map *)map, value,
                        BPF_NOEXIST);
-               return IS_ERR(sdata) ? (unsigned long)NULL :
-                                            (unsigned long)sdata->data;
-       }
 
-       return (unsigned long)NULL;
+unlock:
+       bpf_task_storage_unlock();
+       return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL :
+               (unsigned long)sdata->data;
 }
 
 BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
           task)
 {
+       int ret;
+
        if (!task)
                return -EINVAL;
 
+       if (!bpf_task_storage_trylock())
+               return -EBUSY;
+
        /* This helper must only be called from places where the lifetime of the task
         * is guaranteed. Either by being refcounted or by being protected
         * by an RCU read-side critical section.
         */
-       return task_storage_delete(task, map);
+       ret = task_storage_delete(task, map);
+       bpf_task_storage_unlock();
+       return ret;
 }
 
 static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -276,7 +298,7 @@ static void task_storage_map_free(struct bpf_map *map)
 
        smap = (struct bpf_local_storage_map *)map;
        bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx);
-       bpf_local_storage_map_free(smap);
+       bpf_local_storage_map_free(smap, &bpf_task_storage_busy);
 }
 
 static int task_storage_map_btf_id;
index b1a76fe..369faed 100644 (file)
 #define BITS_ROUNDUP_BYTES(bits) \
        (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
 
-#define BTF_INFO_MASK 0x8f00ffff
+#define BTF_INFO_MASK 0x9f00ffff
 #define BTF_INT_MASK 0x0fffffff
 #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
 #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
@@ -280,6 +280,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
        [BTF_KIND_FUNC_PROTO]   = "FUNC_PROTO",
        [BTF_KIND_VAR]          = "VAR",
        [BTF_KIND_DATASEC]      = "DATASEC",
+       [BTF_KIND_FLOAT]        = "FLOAT",
 };
 
 static const char *btf_type_str(const struct btf_type *t)
@@ -574,6 +575,7 @@ static bool btf_type_has_size(const struct btf_type *t)
        case BTF_KIND_UNION:
        case BTF_KIND_ENUM:
        case BTF_KIND_DATASEC:
+       case BTF_KIND_FLOAT:
                return true;
        }
 
@@ -1704,6 +1706,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type,
                case BTF_KIND_STRUCT:
                case BTF_KIND_UNION:
                case BTF_KIND_ENUM:
+               case BTF_KIND_FLOAT:
                        size = type->size;
                        goto resolved;
 
@@ -1849,7 +1852,7 @@ static int btf_df_check_kflag_member(struct btf_verifier_env *env,
        return -EINVAL;
 }
 
-/* Used for ptr, array and struct/union type members.
+/* Used for ptr, array struct/union and float type members.
  * int, enum and modifier types have their specific callback functions.
  */
 static int btf_generic_check_kflag_member(struct btf_verifier_env *env,
@@ -3675,6 +3678,81 @@ static const struct btf_kind_operations datasec_ops = {
        .show                   = btf_datasec_show,
 };
 
+static s32 btf_float_check_meta(struct btf_verifier_env *env,
+                               const struct btf_type *t,
+                               u32 meta_left)
+{
+       if (btf_type_vlen(t)) {
+               btf_verifier_log_type(env, t, "vlen != 0");
+               return -EINVAL;
+       }
+
+       if (btf_type_kflag(t)) {
+               btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+               return -EINVAL;
+       }
+
+       if (t->size != 2 && t->size != 4 && t->size != 8 && t->size != 12 &&
+           t->size != 16) {
+               btf_verifier_log_type(env, t, "Invalid type_size");
+               return -EINVAL;
+       }
+
+       btf_verifier_log_type(env, t, NULL);
+
+       return 0;
+}
+
+static int btf_float_check_member(struct btf_verifier_env *env,
+                                 const struct btf_type *struct_type,
+                                 const struct btf_member *member,
+                                 const struct btf_type *member_type)
+{
+       u64 start_offset_bytes;
+       u64 end_offset_bytes;
+       u64 misalign_bits;
+       u64 align_bytes;
+       u64 align_bits;
+
+       /* Different architectures have different alignment requirements, so
+        * here we check only for the reasonable minimum. This way we ensure
+        * that types after CO-RE can pass the kernel BTF verifier.
+        */
+       align_bytes = min_t(u64, sizeof(void *), member_type->size);
+       align_bits = align_bytes * BITS_PER_BYTE;
+       div64_u64_rem(member->offset, align_bits, &misalign_bits);
+       if (misalign_bits) {
+               btf_verifier_log_member(env, struct_type, member,
+                                       "Member is not properly aligned");
+               return -EINVAL;
+       }
+
+       start_offset_bytes = member->offset / BITS_PER_BYTE;
+       end_offset_bytes = start_offset_bytes + member_type->size;
+       if (end_offset_bytes > struct_type->size) {
+               btf_verifier_log_member(env, struct_type, member,
+                                       "Member exceeds struct_size");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static void btf_float_log(struct btf_verifier_env *env,
+                         const struct btf_type *t)
+{
+       btf_verifier_log(env, "size=%u", t->size);
+}
+
+static const struct btf_kind_operations float_ops = {
+       .check_meta = btf_float_check_meta,
+       .resolve = btf_df_resolve,
+       .check_member = btf_float_check_member,
+       .check_kflag_member = btf_generic_check_kflag_member,
+       .log_details = btf_float_log,
+       .show = btf_df_show,
+};
+
 static int btf_func_proto_check(struct btf_verifier_env *env,
                                const struct btf_type *t)
 {
@@ -3808,6 +3886,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
        [BTF_KIND_FUNC_PROTO] = &func_proto_ops,
        [BTF_KIND_VAR] = &var_ops,
        [BTF_KIND_DATASEC] = &datasec_ops,
+       [BTF_KIND_FLOAT] = &float_ops,
 };
 
 static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -4592,8 +4671,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
        }
        arg = off / 8;
        args = (const struct btf_param *)(t + 1);
-       /* if (t == NULL) Fall back to default BPF prog with 5 u64 arguments */
-       nr_args = t ? btf_type_vlen(t) : 5;
+       /* if (t == NULL) Fall back to default BPF prog with
+        * MAX_BPF_FUNC_REG_ARGS u64 arguments.
+        */
+       nr_args = t ? btf_type_vlen(t) : MAX_BPF_FUNC_REG_ARGS;
        if (prog->aux->attach_btf_trace) {
                /* skip first 'void *__data' argument in btf_trace_##name typedef */
                args++;
@@ -4649,7 +4730,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
                }
        } else {
                if (!t)
-                       /* Default prog with 5 args */
+                       /* Default prog with MAX_BPF_FUNC_REG_ARGS args */
                        return true;
                t = btf_type_by_id(btf, args[arg].type);
        }
@@ -5100,12 +5181,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 
        if (!func) {
                /* BTF function prototype doesn't match the verifier types.
-                * Fall back to 5 u64 args.
+                * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args.
                 */
-               for (i = 0; i < 5; i++)
+               for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
                        m->arg_size[i] = 8;
                m->ret_size = 8;
-               m->nr_args = 5;
+               m->nr_args = MAX_BPF_FUNC_REG_ARGS;
                return 0;
        }
        args = (const struct btf_param *)(func + 1);
@@ -5328,8 +5409,9 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
        }
        args = (const struct btf_param *)(t + 1);
        nargs = btf_type_vlen(t);
-       if (nargs > 5) {
-               bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs);
+       if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+               bpf_log(log, "Function %s has %d > %d args\n", tname, nargs,
+                       MAX_BPF_FUNC_REG_ARGS);
                goto out;
        }
 
@@ -5458,9 +5540,9 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
        }
        args = (const struct btf_param *)(t + 1);
        nargs = btf_type_vlen(t);
-       if (nargs > 5) {
-               bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n",
-                       tname, nargs);
+       if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+               bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n",
+                       tname, nargs, MAX_BPF_FUNC_REG_ARGS);
                return -EINVAL;
        }
        /* check that function returns int */
index 5d1469d..0cf2791 100644 (file)
@@ -543,7 +543,6 @@ static void cpu_map_free(struct bpf_map *map)
         * complete.
         */
 
-       bpf_clear_redirect_map(map);
        synchronize_rcu();
 
        /* For cpu_map the remote CPUs can still be using the entries
@@ -563,7 +562,7 @@ static void cpu_map_free(struct bpf_map *map)
        kfree(cmap);
 }
 
-struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
 {
        struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
        struct bpf_cpu_map_entry *rcpu;
@@ -600,6 +599,11 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
        return 0;
 }
 
+static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+       return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem);
+}
+
 static int cpu_map_btf_id;
 const struct bpf_map_ops cpu_map_ops = {
        .map_meta_equal         = bpf_map_meta_equal,
@@ -612,6 +616,7 @@ const struct bpf_map_ops cpu_map_ops = {
        .map_check_btf          = map_check_no_btf,
        .map_btf_name           = "bpf_cpu_map",
        .map_btf_id             = &cpu_map_btf_id,
+       .map_redirect           = cpu_map_redirect,
 };
 
 static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
index 85d9d1b..7a5ad73 100644 (file)
@@ -197,7 +197,6 @@ static void dev_map_free(struct bpf_map *map)
        list_del_rcu(&dtab->list);
        spin_unlock(&dev_map_lock);
 
-       bpf_clear_redirect_map(map);
        synchronize_rcu();
 
        /* Make sure prior __dev_map_entry_free() have completed. */
@@ -258,7 +257,7 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
        return 0;
 }
 
-struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
+static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
 {
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct hlist_head *head = dev_map_index_hash(dtab, key);
@@ -392,7 +391,7 @@ void __dev_flush(void)
  * update happens in parallel here a dev_put wont happen until after reading the
  * ifindex.
  */
-struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 {
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *obj;
@@ -735,6 +734,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
                                         map, key, value, map_flags);
 }
 
+static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+       return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem);
+}
+
+static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+       return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem);
+}
+
 static int dev_map_btf_id;
 const struct bpf_map_ops dev_map_ops = {
        .map_meta_equal = bpf_map_meta_equal,
@@ -747,6 +756,7 @@ const struct bpf_map_ops dev_map_ops = {
        .map_check_btf = map_check_no_btf,
        .map_btf_name = "bpf_dtab",
        .map_btf_id = &dev_map_btf_id,
+       .map_redirect = dev_map_redirect,
 };
 
 static int dev_map_hash_map_btf_id;
@@ -761,6 +771,7 @@ const struct bpf_map_ops dev_map_hash_ops = {
        .map_check_btf = map_check_no_btf,
        .map_btf_name = "bpf_dtab",
        .map_btf_id = &dev_map_hash_map_btf_id,
+       .map_redirect = dev_hash_map_redirect,
 };
 
 static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
index d63912e..330d721 100644 (file)
@@ -1869,6 +1869,63 @@ static const struct bpf_iter_seq_info iter_seq_info = {
        .seq_priv_size          = sizeof(struct bpf_iter_seq_hash_map_info),
 };
 
+static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn,
+                                 void *callback_ctx, u64 flags)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       struct hlist_nulls_head *head;
+       struct hlist_nulls_node *n;
+       struct htab_elem *elem;
+       u32 roundup_key_size;
+       int i, num_elems = 0;
+       void __percpu *pptr;
+       struct bucket *b;
+       void *key, *val;
+       bool is_percpu;
+       u64 ret = 0;
+
+       if (flags != 0)
+               return -EINVAL;
+
+       is_percpu = htab_is_percpu(htab);
+
+       roundup_key_size = round_up(map->key_size, 8);
+       /* disable migration so percpu value prepared here will be the
+        * same as the one seen by the bpf program with bpf_map_lookup_elem().
+        */
+       if (is_percpu)
+               migrate_disable();
+       for (i = 0; i < htab->n_buckets; i++) {
+               b = &htab->buckets[i];
+               rcu_read_lock();
+               head = &b->head;
+               hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
+                       key = elem->key;
+                       if (is_percpu) {
+                               /* current cpu value for percpu map */
+                               pptr = htab_elem_get_ptr(elem, map->key_size);
+                               val = this_cpu_ptr(pptr);
+                       } else {
+                               val = elem->key + roundup_key_size;
+                       }
+                       num_elems++;
+                       ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
+                                       (u64)(long)key, (u64)(long)val,
+                                       (u64)(long)callback_ctx, 0);
+                       /* return value: 0 - continue, 1 - stop and return */
+                       if (ret) {
+                               rcu_read_unlock();
+                               goto out;
+                       }
+               }
+               rcu_read_unlock();
+       }
+out:
+       if (is_percpu)
+               migrate_enable();
+       return num_elems;
+}
+
 static int htab_map_btf_id;
 const struct bpf_map_ops htab_map_ops = {
        .map_meta_equal = bpf_map_meta_equal,
@@ -1881,6 +1938,8 @@ const struct bpf_map_ops htab_map_ops = {
        .map_delete_elem = htab_map_delete_elem,
        .map_gen_lookup = htab_map_gen_lookup,
        .map_seq_show_elem = htab_map_seq_show_elem,
+       .map_set_for_each_callback_args = map_set_for_each_callback_args,
+       .map_for_each_callback = bpf_for_each_hash_elem,
        BATCH_OPS(htab),
        .map_btf_name = "bpf_htab",
        .map_btf_id = &htab_map_btf_id,
@@ -1900,6 +1959,8 @@ const struct bpf_map_ops htab_lru_map_ops = {
        .map_delete_elem = htab_lru_map_delete_elem,
        .map_gen_lookup = htab_lru_map_gen_lookup,
        .map_seq_show_elem = htab_map_seq_show_elem,
+       .map_set_for_each_callback_args = map_set_for_each_callback_args,
+       .map_for_each_callback = bpf_for_each_hash_elem,
        BATCH_OPS(htab_lru),
        .map_btf_name = "bpf_htab",
        .map_btf_id = &htab_lru_map_btf_id,
@@ -2019,6 +2080,8 @@ const struct bpf_map_ops htab_percpu_map_ops = {
        .map_update_elem = htab_percpu_map_update_elem,
        .map_delete_elem = htab_map_delete_elem,
        .map_seq_show_elem = htab_percpu_map_seq_show_elem,
+       .map_set_for_each_callback_args = map_set_for_each_callback_args,
+       .map_for_each_callback = bpf_for_each_hash_elem,
        BATCH_OPS(htab_percpu),
        .map_btf_name = "bpf_htab",
        .map_btf_id = &htab_percpu_map_btf_id,
@@ -2036,6 +2099,8 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
        .map_update_elem = htab_lru_percpu_map_update_elem,
        .map_delete_elem = htab_lru_map_delete_elem,
        .map_seq_show_elem = htab_percpu_map_seq_show_elem,
+       .map_set_for_each_callback_args = map_set_for_each_callback_args,
+       .map_for_each_callback = bpf_for_each_hash_elem,
        BATCH_OPS(htab_lru_percpu),
        .map_btf_name = "bpf_htab",
        .map_btf_id = &htab_lru_percpu_map_btf_id,
index 308427f..0748002 100644 (file)
@@ -708,6 +708,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
                return &bpf_ringbuf_discard_proto;
        case BPF_FUNC_ringbuf_query:
                return &bpf_ringbuf_query_proto;
+       case BPF_FUNC_for_each_map_elem:
+               return &bpf_for_each_map_elem_proto;
        default:
                break;
        }
index c56e3fc..f9096b0 100644 (file)
@@ -234,6 +234,12 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn)
               insn->src_reg == BPF_PSEUDO_CALL;
 }
 
+static bool bpf_pseudo_func(const struct bpf_insn *insn)
+{
+       return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
+              insn->src_reg == BPF_PSEUDO_FUNC;
+}
+
 struct bpf_call_arg_meta {
        struct bpf_map *map_ptr;
        bool raw_mode;
@@ -248,6 +254,7 @@ struct bpf_call_arg_meta {
        u32 btf_id;
        struct btf *ret_btf;
        u32 ret_btf_id;
+       u32 subprogno;
 };
 
 struct btf *btf_vmlinux;
@@ -390,6 +397,24 @@ __printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env,
        env->prev_linfo = linfo;
 }
 
+static void verbose_invalid_scalar(struct bpf_verifier_env *env,
+                                  struct bpf_reg_state *reg,
+                                  struct tnum *range, const char *ctx,
+                                  const char *reg_name)
+{
+       char tn_buf[48];
+
+       verbose(env, "At %s the register %s ", ctx, reg_name);
+       if (!tnum_is_unknown(reg->var_off)) {
+               tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+               verbose(env, "has value %s", tn_buf);
+       } else {
+               verbose(env, "has unknown scalar value");
+       }
+       tnum_strn(tn_buf, sizeof(tn_buf), *range);
+       verbose(env, " should have been in %s\n", tn_buf);
+}
+
 static bool type_is_pkt_pointer(enum bpf_reg_type type)
 {
        return type == PTR_TO_PACKET ||
@@ -409,6 +434,7 @@ static bool reg_type_not_null(enum bpf_reg_type type)
        return type == PTR_TO_SOCKET ||
                type == PTR_TO_TCP_SOCK ||
                type == PTR_TO_MAP_VALUE ||
+               type == PTR_TO_MAP_KEY ||
                type == PTR_TO_SOCK_COMMON;
 }
 
@@ -451,7 +477,8 @@ static bool arg_type_may_be_null(enum bpf_arg_type type)
               type == ARG_PTR_TO_MEM_OR_NULL ||
               type == ARG_PTR_TO_CTX_OR_NULL ||
               type == ARG_PTR_TO_SOCKET_OR_NULL ||
-              type == ARG_PTR_TO_ALLOC_MEM_OR_NULL;
+              type == ARG_PTR_TO_ALLOC_MEM_OR_NULL ||
+              type == ARG_PTR_TO_STACK_OR_NULL;
 }
 
 /* Determine whether the function releases some resources allocated by another
@@ -541,6 +568,8 @@ static const char * const reg_type_str[] = {
        [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
        [PTR_TO_RDWR_BUF]       = "rdwr_buf",
        [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
+       [PTR_TO_FUNC]           = "func",
+       [PTR_TO_MAP_KEY]        = "map_key",
 };
 
 static char slot_type_char[] = {
@@ -612,6 +641,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
                        if (type_is_pkt_pointer(t))
                                verbose(env, ",r=%d", reg->range);
                        else if (t == CONST_PTR_TO_MAP ||
+                                t == PTR_TO_MAP_KEY ||
                                 t == PTR_TO_MAP_VALUE ||
                                 t == PTR_TO_MAP_VALUE_OR_NULL)
                                verbose(env, ",ks=%d,vs=%d",
@@ -1519,7 +1549,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
        }
        ret = find_subprog(env, off);
        if (ret >= 0)
-               return 0;
+               return ret;
        if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
                verbose(env, "too many subprograms\n");
                return -E2BIG;
@@ -1527,7 +1557,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
        env->subprog_info[env->subprog_cnt++].start = off;
        sort(env->subprog_info, env->subprog_cnt,
             sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
-       return 0;
+       return env->subprog_cnt - 1;
 }
 
 static int check_subprogs(struct bpf_verifier_env *env)
@@ -1544,6 +1574,19 @@ static int check_subprogs(struct bpf_verifier_env *env)
 
        /* determine subprog starts. The end is one before the next starts */
        for (i = 0; i < insn_cnt; i++) {
+               if (bpf_pseudo_func(insn + i)) {
+                       if (!env->bpf_capable) {
+                               verbose(env,
+                                       "function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
+                               return -EPERM;
+                       }
+                       ret = add_subprog(env, i + insn[i].imm + 1);
+                       if (ret < 0)
+                               return ret;
+                       /* remember subprog */
+                       insn[i + 1].imm = ret;
+                       continue;
+               }
                if (!bpf_pseudo_call(insn + i))
                        continue;
                if (!env->bpf_capable) {
@@ -2295,6 +2338,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
        case PTR_TO_PERCPU_BTF_ID:
        case PTR_TO_MEM:
        case PTR_TO_MEM_OR_NULL:
+       case PTR_TO_FUNC:
+       case PTR_TO_MAP_KEY:
                return true;
        default:
                return false;
@@ -2899,6 +2944,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno,
 
        reg = &cur_regs(env)[regno];
        switch (reg->type) {
+       case PTR_TO_MAP_KEY:
+               verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n",
+                       mem_size, off, size);
+               break;
        case PTR_TO_MAP_VALUE:
                verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
                        mem_size, off, size);
@@ -3304,6 +3353,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
        case PTR_TO_FLOW_KEYS:
                pointer_desc = "flow keys ";
                break;
+       case PTR_TO_MAP_KEY:
+               pointer_desc = "key ";
+               break;
        case PTR_TO_MAP_VALUE:
                pointer_desc = "value ";
                break;
@@ -3405,7 +3457,7 @@ process_func:
 continue_func:
        subprog_end = subprog[idx + 1].start;
        for (; i < subprog_end; i++) {
-               if (!bpf_pseudo_call(insn + i))
+               if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
                        continue;
                /* remember insn and function to return to */
                ret_insn[frame] = i + 1;
@@ -3842,7 +3894,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
        /* for access checks, reg->off is just part of off */
        off += reg->off;
 
-       if (reg->type == PTR_TO_MAP_VALUE) {
+       if (reg->type == PTR_TO_MAP_KEY) {
+               if (t == BPF_WRITE) {
+                       verbose(env, "write to change key R%d not allowed\n", regno);
+                       return -EACCES;
+               }
+
+               err = check_mem_region_access(env, regno, off, size,
+                                             reg->map_ptr->key_size, false);
+               if (err)
+                       return err;
+               if (value_regno >= 0)
+                       mark_reg_unknown(env, regs, value_regno);
+       } else if (reg->type == PTR_TO_MAP_VALUE) {
                if (t == BPF_WRITE && value_regno >= 0 &&
                    is_pointer_value(env, value_regno)) {
                        verbose(env, "R%d leaks addr into map\n", value_regno);
@@ -4258,6 +4322,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
        case PTR_TO_PACKET_META:
                return check_packet_access(env, regno, reg->off, access_size,
                                           zero_size_allowed);
+       case PTR_TO_MAP_KEY:
+               return check_mem_region_access(env, regno, reg->off, access_size,
+                                              reg->map_ptr->key_size, false);
        case PTR_TO_MAP_VALUE:
                if (check_map_access_type(env, regno, reg->off, access_size,
                                          meta && meta->raw_mode ? BPF_WRITE :
@@ -4474,6 +4541,7 @@ static const struct bpf_reg_types map_key_value_types = {
                PTR_TO_STACK,
                PTR_TO_PACKET,
                PTR_TO_PACKET_META,
+               PTR_TO_MAP_KEY,
                PTR_TO_MAP_VALUE,
        },
 };
@@ -4505,6 +4573,7 @@ static const struct bpf_reg_types mem_types = {
                PTR_TO_STACK,
                PTR_TO_PACKET,
                PTR_TO_PACKET_META,
+               PTR_TO_MAP_KEY,
                PTR_TO_MAP_VALUE,
                PTR_TO_MEM,
                PTR_TO_RDONLY_BUF,
@@ -4517,6 +4586,7 @@ static const struct bpf_reg_types int_ptr_types = {
                PTR_TO_STACK,
                PTR_TO_PACKET,
                PTR_TO_PACKET_META,
+               PTR_TO_MAP_KEY,
                PTR_TO_MAP_VALUE,
        },
 };
@@ -4529,6 +4599,8 @@ static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_T
 static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
 static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
 static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
+static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
+static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
 
 static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
        [ARG_PTR_TO_MAP_KEY]            = &map_key_value_types,
@@ -4557,6 +4629,8 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
        [ARG_PTR_TO_INT]                = &int_ptr_types,
        [ARG_PTR_TO_LONG]               = &int_ptr_types,
        [ARG_PTR_TO_PERCPU_BTF_ID]      = &percpu_btf_ptr_types,
+       [ARG_PTR_TO_FUNC]               = &func_ptr_types,
+       [ARG_PTR_TO_STACK_OR_NULL]      = &stack_ptr_types,
 };
 
 static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -4738,6 +4812,8 @@ skip_type_check:
                        verbose(env, "verifier internal error\n");
                        return -EFAULT;
                }
+       } else if (arg_type == ARG_PTR_TO_FUNC) {
+               meta->subprogno = reg->subprogno;
        } else if (arg_type_is_mem_ptr(arg_type)) {
                /* The access to this pointer is only checked when we hit the
                 * next is_mem_size argument below.
@@ -5258,13 +5334,19 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env,
        }
 }
 
-static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
-                          int *insn_idx)
+typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
+                                  struct bpf_func_state *caller,
+                                  struct bpf_func_state *callee,
+                                  int insn_idx);
+
+static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+                            int *insn_idx, int subprog,
+                            set_callee_state_fn set_callee_state_cb)
 {
        struct bpf_verifier_state *state = env->cur_state;
        struct bpf_func_info_aux *func_info_aux;
        struct bpf_func_state *caller, *callee;
-       int i, err, subprog, target_insn;
+       int err;
        bool is_global = false;
 
        if (state->curframe + 1 >= MAX_CALL_FRAMES) {
@@ -5273,14 +5355,6 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
                return -E2BIG;
        }
 
-       target_insn = *insn_idx + insn->imm;
-       subprog = find_subprog(env, target_insn + 1);
-       if (subprog < 0) {
-               verbose(env, "verifier bug. No program starts at insn %d\n",
-                       target_insn + 1);
-               return -EFAULT;
-       }
-
        caller = state->frame[state->curframe];
        if (state->frame[state->curframe + 1]) {
                verbose(env, "verifier bug. Frame %d already allocated\n",
@@ -5335,11 +5409,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
        if (err)
                return err;
 
-       /* copy r1 - r5 args that callee can access.  The copy includes parent
-        * pointers, which connects us up to the liveness chain
-        */
-       for (i = BPF_REG_1; i <= BPF_REG_5; i++)
-               callee->regs[i] = caller->regs[i];
+       err = set_callee_state_cb(env, caller, callee, *insn_idx);
+       if (err)
+               return err;
 
        clear_caller_saved_regs(env, caller->regs);
 
@@ -5347,7 +5419,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
        state->curframe++;
 
        /* and go analyze first insn of the callee */
-       *insn_idx = target_insn;
+       *insn_idx = env->subprog_info[subprog].start - 1;
 
        if (env->log.level & BPF_LOG_LEVEL) {
                verbose(env, "caller:\n");
@@ -5358,6 +5430,92 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
        return 0;
 }
 
+int map_set_for_each_callback_args(struct bpf_verifier_env *env,
+                                  struct bpf_func_state *caller,
+                                  struct bpf_func_state *callee)
+{
+       /* bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn,
+        *      void *callback_ctx, u64 flags);
+        * callback_fn(struct bpf_map *map, void *key, void *value,
+        *      void *callback_ctx);
+        */
+       callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
+
+       callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+       __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+       callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+       callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+       __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+       callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+       /* pointer to stack or null */
+       callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3];
+
+       /* unused */
+       __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+       return 0;
+}
+
+static int set_callee_state(struct bpf_verifier_env *env,
+                           struct bpf_func_state *caller,
+                           struct bpf_func_state *callee, int insn_idx)
+{
+       int i;
+
+       /* copy r1 - r5 args that callee can access.  The copy includes parent
+        * pointers, which connects us up to the liveness chain
+        */
+       for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+               callee->regs[i] = caller->regs[i];
+       return 0;
+}
+
+static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+                          int *insn_idx)
+{
+       int subprog, target_insn;
+
+       target_insn = *insn_idx + insn->imm + 1;
+       subprog = find_subprog(env, target_insn);
+       if (subprog < 0) {
+               verbose(env, "verifier bug. No program starts at insn %d\n",
+                       target_insn);
+               return -EFAULT;
+       }
+
+       return __check_func_call(env, insn, insn_idx, subprog, set_callee_state);
+}
+
+static int set_map_elem_callback_state(struct bpf_verifier_env *env,
+                                      struct bpf_func_state *caller,
+                                      struct bpf_func_state *callee,
+                                      int insn_idx)
+{
+       struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx];
+       struct bpf_map *map;
+       int err;
+
+       if (bpf_map_ptr_poisoned(insn_aux)) {
+               verbose(env, "tail_call abusing map_ptr\n");
+               return -EINVAL;
+       }
+
+       map = BPF_MAP_PTR(insn_aux->map_ptr_state);
+       if (!map->ops->map_set_for_each_callback_args ||
+           !map->ops->map_for_each_callback) {
+               verbose(env, "callback function not allowed for map\n");
+               return -ENOTSUPP;
+       }
+
+       err = map->ops->map_set_for_each_callback_args(env, caller, callee);
+       if (err)
+               return err;
+
+       callee->in_callback_fn = true;
+       return 0;
+}
+
 static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 {
        struct bpf_verifier_state *state = env->cur_state;
@@ -5380,8 +5538,22 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 
        state->curframe--;
        caller = state->frame[state->curframe];
-       /* return to the caller whatever r0 had in the callee */
-       caller->regs[BPF_REG_0] = *r0;
+       if (callee->in_callback_fn) {
+               /* enforce R0 return value range [0, 1]. */
+               struct tnum range = tnum_range(0, 1);
+
+               if (r0->type != SCALAR_VALUE) {
+                       verbose(env, "R0 not a scalar value\n");
+                       return -EACCES;
+               }
+               if (!tnum_in(range, r0->var_off)) {
+                       verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
+                       return -EINVAL;
+               }
+       } else {
+               /* return to the caller whatever r0 had in the callee */
+               caller->regs[BPF_REG_0] = *r0;
+       }
 
        /* Transfer references to the caller */
        err = transfer_reference_state(caller, callee);
@@ -5436,7 +5608,9 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
            func_id != BPF_FUNC_map_delete_elem &&
            func_id != BPF_FUNC_map_push_elem &&
            func_id != BPF_FUNC_map_pop_elem &&
-           func_id != BPF_FUNC_map_peek_elem)
+           func_id != BPF_FUNC_map_peek_elem &&
+           func_id != BPF_FUNC_for_each_map_elem &&
+           func_id != BPF_FUNC_redirect_map)
                return 0;
 
        if (map == NULL) {
@@ -5517,15 +5691,18 @@ static int check_reference_leak(struct bpf_verifier_env *env)
        return state->acquired_refs ? -EINVAL : 0;
 }
 
-static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
+static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+                            int *insn_idx_p)
 {
        const struct bpf_func_proto *fn = NULL;
        struct bpf_reg_state *regs;
        struct bpf_call_arg_meta meta;
+       int insn_idx = *insn_idx_p;
        bool changes_data;
-       int i, err;
+       int i, err, func_id;
 
        /* find function prototype */
+       func_id = insn->imm;
        if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
                verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
                        func_id);
@@ -5571,7 +5748,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 
        meta.func_id = func_id;
        /* check args */
-       for (i = 0; i < 5; i++) {
+       for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
                err = check_func_arg(env, i, &meta, fn);
                if (err)
                        return err;
@@ -5621,6 +5798,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
                return -EINVAL;
        }
 
+       if (func_id == BPF_FUNC_for_each_map_elem) {
+               err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+                                       set_map_elem_callback_state);
+               if (err < 0)
+                       return -EINVAL;
+       }
+
        /* reset caller saved regs */
        for (i = 0; i < CALLER_SAVED_REGS; i++) {
                mark_reg_not_init(env, regs, caller_saved[i]);
@@ -5874,6 +6058,19 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
                else
                        *ptr_limit = -off;
                return 0;
+       case PTR_TO_MAP_KEY:
+               /* Currently, this code is not exercised as the only use
+                * is the bpf_for_each_map_elem() helper, which requires
+                * bpf_capable. The code has been tested manually for
+                * future use.
+                */
+               if (mask_to_left) {
+                       *ptr_limit = ptr_reg->umax_value + ptr_reg->off;
+               } else {
+                       off = ptr_reg->smin_value + ptr_reg->off;
+                       *ptr_limit = ptr_reg->map_ptr->key_size - off;
+               }
+               return 0;
        case PTR_TO_MAP_VALUE:
                if (mask_to_left) {
                        *ptr_limit = ptr_reg->umax_value + ptr_reg->off;
@@ -5904,7 +6101,7 @@ static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
             aux->alu_limit != alu_limit))
                return -EACCES;
 
-       /* Corresponding fixup done in fixup_bpf_calls(). */
+       /* Corresponding fixup done in do_misc_fixups(). */
        aux->alu_state = alu_state;
        aux->alu_limit = alu_limit;
        return 0;
@@ -6075,6 +6272,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
                verbose(env, "R%d pointer arithmetic on %s prohibited\n",
                        dst, reg_type_str[ptr_reg->type]);
                return -EACCES;
+       case PTR_TO_MAP_KEY:
        case PTR_TO_MAP_VALUE:
                if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) {
                        verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n",
@@ -8254,6 +8452,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
                return 0;
        }
 
+       if (insn->src_reg == BPF_PSEUDO_FUNC) {
+               struct bpf_prog_aux *aux = env->prog->aux;
+               u32 subprogno = insn[1].imm;
+
+               if (!aux->func_info) {
+                       verbose(env, "missing btf func_info\n");
+                       return -EINVAL;
+               }
+               if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) {
+                       verbose(env, "callback function not static\n");
+                       return -EINVAL;
+               }
+
+               dst_reg->type = PTR_TO_FUNC;
+               dst_reg->subprogno = subprogno;
+               return 0;
+       }
+
        map = env->used_maps[aux->map_index];
        mark_reg_known_zero(env, regs, insn->dst_reg);
        dst_reg->map_ptr = map;
@@ -8482,17 +8698,7 @@ static int check_return_code(struct bpf_verifier_env *env)
        }
 
        if (!tnum_in(range, reg->var_off)) {
-               char tn_buf[48];
-
-               verbose(env, "At program exit the register R0 ");
-               if (!tnum_is_unknown(reg->var_off)) {
-                       tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-                       verbose(env, "has value %s", tn_buf);
-               } else {
-                       verbose(env, "has unknown scalar value");
-               }
-               tnum_strn(tn_buf, sizeof(tn_buf), range);
-               verbose(env, " should have been in %s\n", tn_buf);
+               verbose_invalid_scalar(env, reg, &range, "program exit", "R0");
                return -EINVAL;
        }
 
@@ -8619,6 +8825,27 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
        return DONE_EXPLORING;
 }
 
+static int visit_func_call_insn(int t, int insn_cnt,
+                               struct bpf_insn *insns,
+                               struct bpf_verifier_env *env,
+                               bool visit_callee)
+{
+       int ret;
+
+       ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
+       if (ret)
+               return ret;
+
+       if (t + 1 < insn_cnt)
+               init_explored_state(env, t + 1);
+       if (visit_callee) {
+               init_explored_state(env, t);
+               ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
+                               env, false);
+       }
+       return ret;
+}
+
 /* Visits the instruction at index t and returns one of the following:
  *  < 0 - an error occurred
  *  DONE_EXPLORING - the instruction was fully explored
@@ -8629,6 +8856,9 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
        struct bpf_insn *insns = env->prog->insnsi;
        int ret;
 
+       if (bpf_pseudo_func(insns + t))
+               return visit_func_call_insn(t, insn_cnt, insns, env, true);
+
        /* All non-branch instructions have a single fall-through edge. */
        if (BPF_CLASS(insns[t].code) != BPF_JMP &&
            BPF_CLASS(insns[t].code) != BPF_JMP32)
@@ -8639,18 +8869,8 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
                return DONE_EXPLORING;
 
        case BPF_CALL:
-               ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
-               if (ret)
-                       return ret;
-
-               if (t + 1 < insn_cnt)
-                       init_explored_state(env, t + 1);
-               if (insns[t].src_reg == BPF_PSEUDO_CALL) {
-                       init_explored_state(env, t);
-                       ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
-                                       env, false);
-               }
-               return ret;
+               return visit_func_call_insn(t, insn_cnt, insns, env,
+                                           insns[t].src_reg == BPF_PSEUDO_CALL);
 
        case BPF_JA:
                if (BPF_SRC(insns[t].code) != BPF_K)
@@ -9259,6 +9479,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
                         */
                        return false;
                }
+       case PTR_TO_MAP_KEY:
        case PTR_TO_MAP_VALUE:
                /* If the new min/max/var_off satisfy the old ones and
                 * everything else matches, we are OK.
@@ -10105,10 +10326,9 @@ static int do_check(struct bpf_verifier_env *env)
                                if (insn->src_reg == BPF_PSEUDO_CALL)
                                        err = check_func_call(env, insn, &env->insn_idx);
                                else
-                                       err = check_helper_call(env, insn->imm, env->insn_idx);
+                                       err = check_helper_call(env, insn, &env->insn_idx);
                                if (err)
                                        return err;
-
                        } else if (opcode == BPF_JA) {
                                if (BPF_SRC(insn->code) != BPF_K ||
                                    insn->imm != 0 ||
@@ -10537,6 +10757,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
                                goto next_insn;
                        }
 
+                       if (insn[0].src_reg == BPF_PSEUDO_FUNC) {
+                               aux = &env->insn_aux_data[i];
+                               aux->ptr_type = PTR_TO_FUNC;
+                               goto next_insn;
+                       }
+
                        /* In final convert_pseudo_ld_imm64() step, this is
                         * converted into regular 64-bit imm load insn.
                         */
@@ -10669,9 +10895,13 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
        int insn_cnt = env->prog->len;
        int i;
 
-       for (i = 0; i < insn_cnt; i++, insn++)
-               if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
-                       insn->src_reg = 0;
+       for (i = 0; i < insn_cnt; i++, insn++) {
+               if (insn->code != (BPF_LD | BPF_IMM | BPF_DW))
+                       continue;
+               if (insn->src_reg == BPF_PSEUDO_FUNC)
+                       continue;
+               insn->src_reg = 0;
+       }
 }
 
 /* single env->prog->insni[off] instruction was replaced with the range
@@ -11310,6 +11540,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
                return 0;
 
        for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+               if (bpf_pseudo_func(insn)) {
+                       env->insn_aux_data[i].call_imm = insn->imm;
+                       /* subprog is encoded in insn[1].imm */
+                       continue;
+               }
+
                if (!bpf_pseudo_call(insn))
                        continue;
                /* Upon error here we cannot fall back to interpreter but
@@ -11439,6 +11675,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
        for (i = 0; i < env->subprog_cnt; i++) {
                insn = func[i]->insnsi;
                for (j = 0; j < func[i]->len; j++, insn++) {
+                       if (bpf_pseudo_func(insn)) {
+                               subprog = insn[1].imm;
+                               insn[0].imm = (u32)(long)func[subprog]->bpf_func;
+                               insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
+                               continue;
+                       }
                        if (!bpf_pseudo_call(insn))
                                continue;
                        subprog = insn->off;
@@ -11484,6 +11726,11 @@ static int jit_subprogs(struct bpf_verifier_env *env)
         * later look the same as if they were interpreted only.
         */
        for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+               if (bpf_pseudo_func(insn)) {
+                       insn[0].imm = env->insn_aux_data[i].call_imm;
+                       insn[1].imm = find_subprog(env, i + insn[0].imm + 1);
+                       continue;
+               }
                if (!bpf_pseudo_call(insn))
                        continue;
                insn->off = env->insn_aux_data[i].call_imm;
@@ -11548,6 +11795,14 @@ static int fixup_call_args(struct bpf_verifier_env *env)
                return -EINVAL;
        }
        for (i = 0; i < prog->len; i++, insn++) {
+               if (bpf_pseudo_func(insn)) {
+                       /* When JIT fails the progs with callback calls
+                        * have to be rejected, since interpreter doesn't support them yet.
+                        */
+                       verbose(env, "callbacks are not allowed in non-JITed programs\n");
+                       return -EINVAL;
+               }
+
                if (!bpf_pseudo_call(insn))
                        continue;
                depth = get_callee_stack_depth(env, insn, i);
@@ -11560,12 +11815,10 @@ static int fixup_call_args(struct bpf_verifier_env *env)
        return err;
 }
 
-/* fixup insn->imm field of bpf_call instructions
- * and inline eligible helpers as explicit sequence of BPF instructions
- *
- * this function is called after eBPF program passed verification
+/* Do various post-verification rewrites in a single program pass.
+ * These rewrites simplify JIT and interpreter implementations.
  */
-static int fixup_bpf_calls(struct bpf_verifier_env *env)
+static int do_misc_fixups(struct bpf_verifier_env *env)
 {
        struct bpf_prog *prog = env->prog;
        bool expect_blinding = bpf_jit_blinding_enabled(prog);
@@ -11580,6 +11833,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
        int i, ret, cnt, delta = 0;
 
        for (i = 0; i < insn_cnt; i++, insn++) {
+               /* Make divide-by-zero exceptions impossible. */
                if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
                    insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
                    insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
@@ -11620,6 +11874,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
                        continue;
                }
 
+               /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
                if (BPF_CLASS(insn->code) == BPF_LD &&
                    (BPF_MODE(insn->code) == BPF_ABS ||
                     BPF_MODE(insn->code) == BPF_IND)) {
@@ -11639,6 +11894,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
                        continue;
                }
 
+               /* Rewrite pointer arithmetic to mitigate speculation attacks. */
                if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
                    insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
                        const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
@@ -11787,7 +12043,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
                     insn->imm == BPF_FUNC_map_delete_elem ||
                     insn->imm == BPF_FUNC_map_push_elem   ||
                     insn->imm == BPF_FUNC_map_pop_elem    ||
-                    insn->imm == BPF_FUNC_map_peek_elem)) {
+                    insn->imm == BPF_FUNC_map_peek_elem   ||
+                    insn->imm == BPF_FUNC_redirect_map)) {
                        aux = &env->insn_aux_data[i + delta];
                        if (bpf_map_ptr_poisoned(aux))
                                goto patch_call_imm;
@@ -11829,6 +12086,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
                                     (int (*)(struct bpf_map *map, void *value))NULL));
                        BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
                                     (int (*)(struct bpf_map *map, void *value))NULL));
+                       BUILD_BUG_ON(!__same_type(ops->map_redirect,
+                                    (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL));
+
 patch_map_ops_generic:
                        switch (insn->imm) {
                        case BPF_FUNC_map_lookup_elem:
@@ -11855,11 +12115,16 @@ patch_map_ops_generic:
                                insn->imm = BPF_CAST_CALL(ops->map_peek_elem) -
                                            __bpf_call_base;
                                continue;
+                       case BPF_FUNC_redirect_map:
+                               insn->imm = BPF_CAST_CALL(ops->map_redirect) -
+                                           __bpf_call_base;
+                               continue;
                        }
 
                        goto patch_call_imm;
                }
 
+               /* Implement bpf_jiffies64 inline. */
                if (prog->jit_requested && BITS_PER_LONG == 64 &&
                    insn->imm == BPF_FUNC_jiffies64) {
                        struct bpf_insn ld_jiffies_addr[2] = {
@@ -12670,7 +12935,7 @@ skip_full_check:
                ret = convert_ctx_accesses(env);
 
        if (ret == 0)
-               ret = fixup_bpf_calls(env);
+               ret = do_misc_fixups(env);
 
        /* do 32-bit optimization after insn patching has done so those patched
         * insns could be handled correctly.
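
Taken together, the verifier changes give callbacks their own calling convention: __check_func_call() is parameterized by a set_callee_state_fn, the callee runs with PTR_TO_MAP_KEY/PTR_TO_MAP_VALUE arguments, and prepare_func_exit() pins R0 of an in_callback_fn frame to the range [0, 1] instead of propagating it to the caller. A hedged sketch of that contract as seen from BPF C, with hypothetical names; any other return value is now rejected by verbose_invalid_scalar() at "callback return":

/* Early-exit callback: R0 must stay within tnum_range(0, 1). */
static __u64 find_key(struct bpf_map *map, __u32 *key, __u64 *val,
		      __u32 *target)
{
	if (*key == *target)
		return 1;	/* stop iterating */
	return 0;		/* keep going */
}
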
index d3171e8..b94391a 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -96,6 +96,7 @@
 #include <linux/kasan.h>
 #include <linux/scs.h>
 #include <linux/io_uring.h>
+#include <linux/bpf.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk)
        cgroup_free(tsk);
        task_numa_free(tsk, true);
        security_task_free(tsk);
+       bpf_task_storage_free(tsk);
        exit_creds(tsk);
        delayacct_tsk_free(tsk);
        put_signal_struct(tsk->signal);
@@ -2064,6 +2066,9 @@ static __latent_entropy struct task_struct *copy_process(
        p->sequential_io        = 0;
        p->sequential_io_avg    = 0;
 #endif
+#ifdef CONFIG_BPF_SYSCALL
+       RCU_INIT_POINTER(p->bpf_storage, NULL);
+#endif
 
        /* Perform scheduler related setup. Assign this task to a CPU. */
        retval = sched_fork(clone_flags, p);
index b0c45d9..0d23755 100644 (file)
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1367,6 +1367,12 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &bpf_per_cpu_ptr_proto;
        case BPF_FUNC_this_cpu_ptr:
                return &bpf_this_cpu_ptr_proto;
+       case BPF_FUNC_task_storage_get:
+               return &bpf_task_storage_get_proto;
+       case BPF_FUNC_task_storage_delete:
+               return &bpf_task_storage_delete_proto;
+       case BPF_FUNC_for_each_map_elem:
+               return &bpf_for_each_map_elem_proto;
        default:
                return NULL;
        }
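
Exposing bpf_task_storage_{get,delete} to tracing programs is what lets tools like runqslower drop their pid-keyed hash maps. A minimal sketch of the pattern, loosely modeled on the runqslower conversion in this series; section and map names here are illustrative:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, __u64);
} start SEC(".maps");

SEC("tp_btf/sched_wakeup")
int BPF_PROG(handle_wakeup, struct task_struct *p)
{
	__u64 *ts;

	/* Storage is keyed by the task itself; create on first use. */
	ts = bpf_task_storage_get(&start, p, 0,
				  BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (ts)
		*ts = bpf_ktime_get_ns();
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
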
index 8cea808..0ead7ec 100644 (file)
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -317,13 +317,9 @@ config BPF_STREAM_PARSER
        select STREAM_PARSER
        select NET_SOCK_MSG
        help
-         Enabling this allows a stream parser to be used with
+         Enabling this allows a TCP stream parser to be used with
          BPF_MAP_TYPE_SOCKMAP.
 
-         BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets.
-         It can be used to enforce socket policy, implement socket redirects,
-         etc.
-
 config NET_FLOW_LIMIT
        bool
        depends on RPS
index 58bcb8c..0abdd67 100644 (file)
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
 #include <net/bpf_sk_storage.h>
 #include <net/sock.h>
 #include <net/tcp.h>
+#include <net/net_namespace.h>
 #include <linux/error-injection.h>
 #include <linux/smp.h>
+#include <linux/sock_diag.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/bpf_test_run.h>
 
+struct bpf_test_timer {
+       enum { NO_PREEMPT, NO_MIGRATE } mode;
+       u32 i;
+       u64 time_start, time_spent;
+};
+
+static void bpf_test_timer_enter(struct bpf_test_timer *t)
+       __acquires(rcu)
+{
+       rcu_read_lock();
+       if (t->mode == NO_PREEMPT)
+               preempt_disable();
+       else
+               migrate_disable();
+
+       t->time_start = ktime_get_ns();
+}
+
+static void bpf_test_timer_leave(struct bpf_test_timer *t)
+       __releases(rcu)
+{
+       t->time_start = 0;
+
+       if (t->mode == NO_PREEMPT)
+               preempt_enable();
+       else
+               migrate_enable();
+       rcu_read_unlock();
+}
+
+static bool bpf_test_timer_continue(struct bpf_test_timer *t, u32 repeat, int *err, u32 *duration)
+       __must_hold(rcu)
+{
+       t->i++;
+       if (t->i >= repeat) {
+               /* We're done. */
+               t->time_spent += ktime_get_ns() - t->time_start;
+               do_div(t->time_spent, t->i);
+               *duration = t->time_spent > U32_MAX ? U32_MAX : (u32)t->time_spent;
+               *err = 0;
+               goto reset;
+       }
+
+       if (signal_pending(current)) {
+               /* During iteration: we've been cancelled, abort. */
+               *err = -EINTR;
+               goto reset;
+       }
+
+       if (need_resched()) {
+               /* During iteration: we need to reschedule between runs. */
+               t->time_spent += ktime_get_ns() - t->time_start;
+               bpf_test_timer_leave(t);
+               cond_resched();
+               bpf_test_timer_enter(t);
+       }
+
+       /* Do another round. */
+       return true;
+
+reset:
+       t->i = 0;
+       return false;
+}
+
 static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
                        u32 *retval, u32 *time, bool xdp)
 {
        struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL };
+       struct bpf_test_timer t = { NO_MIGRATE };
        enum bpf_cgroup_storage_type stype;
-       u64 time_start, time_spent = 0;
-       int ret = 0;
-       u32 i;
+       int ret;
 
        for_each_cgroup_storage_type(stype) {
                storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
@@ -38,40 +104,16 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
        if (!repeat)
                repeat = 1;
 
-       rcu_read_lock();
-       migrate_disable();
-       time_start = ktime_get_ns();
-       for (i = 0; i < repeat; i++) {
+       bpf_test_timer_enter(&t);
+       do {
                bpf_cgroup_storage_set(storage);
 
                if (xdp)
                        *retval = bpf_prog_run_xdp(prog, ctx);
                else
                        *retval = BPF_PROG_RUN(prog, ctx);
-
-               if (signal_pending(current)) {
-                       ret = -EINTR;
-                       break;
-               }
-
-               if (need_resched()) {
-                       time_spent += ktime_get_ns() - time_start;
-                       migrate_enable();
-                       rcu_read_unlock();
-
-                       cond_resched();
-
-                       rcu_read_lock();
-                       migrate_disable();
-                       time_start = ktime_get_ns();
-               }
-       }
-       time_spent += ktime_get_ns() - time_start;
-       migrate_enable();
-       rcu_read_unlock();
-
-       do_div(time_spent, repeat);
-       *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent;
+       } while (bpf_test_timer_continue(&t, repeat, &ret, time));
+       bpf_test_timer_leave(&t);
 
        for_each_cgroup_storage_type(stype)
                bpf_cgroup_storage_free(storage[stype]);
@@ -674,18 +716,17 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
                                     const union bpf_attr *kattr,
                                     union bpf_attr __user *uattr)
 {
+       struct bpf_test_timer t = { NO_PREEMPT };
        u32 size = kattr->test.data_size_in;
        struct bpf_flow_dissector ctx = {};
        u32 repeat = kattr->test.repeat;
        struct bpf_flow_keys *user_ctx;
        struct bpf_flow_keys flow_keys;
-       u64 time_start, time_spent = 0;
        const struct ethhdr *eth;
        unsigned int flags = 0;
        u32 retval, duration;
        void *data;
        int ret;
-       u32 i;
 
        if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR)
                return -EINVAL;
@@ -721,48 +762,127 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
        ctx.data = data;
        ctx.data_end = (__u8 *)data + size;
 
-       rcu_read_lock();
-       preempt_disable();
-       time_start = ktime_get_ns();
-       for (i = 0; i < repeat; i++) {
+       bpf_test_timer_enter(&t);
+       do {
                retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN,
                                          size, flags);
+       } while (bpf_test_timer_continue(&t, repeat, &ret, &duration));
+       bpf_test_timer_leave(&t);
 
-               if (signal_pending(current)) {
-                       preempt_enable();
-                       rcu_read_unlock();
+       if (ret < 0)
+               goto out;
 
-                       ret = -EINTR;
-                       goto out;
-               }
+       ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys),
+                             retval, duration);
+       if (!ret)
+               ret = bpf_ctx_finish(kattr, uattr, user_ctx,
+                                    sizeof(struct bpf_flow_keys));
 
-               if (need_resched()) {
-                       time_spent += ktime_get_ns() - time_start;
-                       preempt_enable();
-                       rcu_read_unlock();
+out:
+       kfree(user_ctx);
+       kfree(data);
+       return ret;
+}
 
-                       cond_resched();
+int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr,
+                               union bpf_attr __user *uattr)
+{
+       struct bpf_test_timer t = { NO_PREEMPT };
+       struct bpf_prog_array *progs = NULL;
+       struct bpf_sk_lookup_kern ctx = {};
+       u32 repeat = kattr->test.repeat;
+       struct bpf_sk_lookup *user_ctx;
+       u32 retval, duration;
+       int ret = -EINVAL;
 
-                       rcu_read_lock();
-                       preempt_disable();
-                       time_start = ktime_get_ns();
-               }
+       if (prog->type != BPF_PROG_TYPE_SK_LOOKUP)
+               return -EINVAL;
+
+       if (kattr->test.flags || kattr->test.cpu)
+               return -EINVAL;
+
+       if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out ||
+           kattr->test.data_size_out)
+               return -EINVAL;
+
+       if (!repeat)
+               repeat = 1;
+
+       user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx));
+       if (IS_ERR(user_ctx))
+               return PTR_ERR(user_ctx);
+
+       if (!user_ctx)
+               return -EINVAL;
+
+       if (user_ctx->sk)
+               goto out;
+
+       if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx)))
+               goto out;
+
+       if (user_ctx->local_port > U16_MAX || user_ctx->remote_port > U16_MAX) {
+               ret = -ERANGE;
+               goto out;
        }
-       time_spent += ktime_get_ns() - time_start;
-       preempt_enable();
-       rcu_read_unlock();
 
-       do_div(time_spent, repeat);
-       duration = time_spent > U32_MAX ? U32_MAX : (u32)time_spent;
+       ctx.family = (u16)user_ctx->family;
+       ctx.protocol = (u16)user_ctx->protocol;
+       ctx.dport = (u16)user_ctx->local_port;
+       ctx.sport = (__force __be16)user_ctx->remote_port;
 
-       ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys),
-                             retval, duration);
+       switch (ctx.family) {
+       case AF_INET:
+               ctx.v4.daddr = (__force __be32)user_ctx->local_ip4;
+               ctx.v4.saddr = (__force __be32)user_ctx->remote_ip4;
+               break;
+
+#if IS_ENABLED(CONFIG_IPV6)
+       case AF_INET6:
+               ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6;
+               ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6;
+               break;
+#endif
+
+       default:
+               ret = -EAFNOSUPPORT;
+               goto out;
+       }
+
+       progs = bpf_prog_array_alloc(1, GFP_KERNEL);
+       if (!progs) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       progs->items[0].prog = prog;
+
+       bpf_test_timer_enter(&t);
+       do {
+               ctx.selected_sk = NULL;
+               retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN);
+       } while (bpf_test_timer_continue(&t, repeat, &ret, &duration));
+       bpf_test_timer_leave(&t);
+
+       if (ret < 0)
+               goto out;
+
+       user_ctx->cookie = 0;
+       if (ctx.selected_sk) {
+               if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) {
+                       ret = -EOPNOTSUPP;
+                       goto out;
+               }
+
+               user_ctx->cookie = sock_gen_cookie(ctx.selected_sk);
+       }
+
+       ret = bpf_test_finish(kattr, uattr, NULL, 0, retval, duration);
        if (!ret)
-               ret = bpf_ctx_finish(kattr, uattr, user_ctx,
-                                    sizeof(struct bpf_flow_keys));
+               ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx));
 
 out:
+       bpf_prog_array_free(progs);
        kfree(user_ctx);
-       kfree(data);
        return ret;
 }
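
From user space the new hook is reached through BPF_PROG_TEST_RUN with a context but no data buffers (data_in/data_out are rejected above). A hedged sketch assuming libbpf's bpf_prog_test_run_xattr() of this era and an already-loaded BPF_PROG_TYPE_SK_LOOKUP program in prog_fd:

#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

int test_run_sk_lookup(int prog_fd)
{
	struct bpf_sk_lookup ctx;
	struct bpf_prog_test_run_attr attr = {
		.prog_fd = prog_fd,
		.repeat = 1,
		.ctx_in = &ctx,
		.ctx_size_in = sizeof(ctx),
		.ctx_out = &ctx,
		.ctx_size_out = sizeof(ctx),
	};
	int err;

	memset(&ctx, 0, sizeof(ctx));		/* ctx.sk must stay NULL */
	ctx.family = AF_INET;
	ctx.protocol = IPPROTO_TCP;
	ctx.local_ip4 = htonl(0x7f000001);	/* addresses are network byte order */
	ctx.remote_ip4 = htonl(0x7f000001);
	ctx.local_port = 8080;			/* host byte order, per the cast above */
	ctx.remote_port = htons(54321);		/* network byte order, per the cast above */

	err = bpf_prog_test_run_xattr(&attr);
	if (err)
		return err;

	/* attr.retval holds the program's verdict; ctx.cookie is
	 * non-zero if the program selected a socket.
	 */
	return (int)attr.retval;
}
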
index 3e2c378..0c2233c 100644 (file)
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,7 +16,6 @@ obj-y              += dev.o dev_addr_lists.o dst.o netevent.o \
 obj-y += net-sysfs.o
 obj-$(CONFIG_PAGE_POOL) += page_pool.o
 obj-$(CONFIG_PROC_FS) += net-procfs.o
-obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
 obj-$(CONFIG_NET_PKTGEN) += pktgen.o
 obj-$(CONFIG_NETPOLL) += netpoll.o
 obj-$(CONFIG_FIB_RULES) += fib_rules.o
@@ -28,10 +27,13 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
 obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
 obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
-obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
 obj-$(CONFIG_HWBM) += hwbm.o
 obj-$(CONFIG_NET_DEVLINK) += devlink.o
 obj-$(CONFIG_GRO_CELLS) += gro_cells.o
 obj-$(CONFIG_FAILOVER) += failover.o
+ifeq ($(CONFIG_INET),y)
+obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
+obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
+endif
 obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
index 4edd033..cc3712a 100644 (file)
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -89,7 +89,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
 
        smap = (struct bpf_local_storage_map *)map;
        bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx);
-       bpf_local_storage_map_free(smap);
+       bpf_local_storage_map_free(smap, NULL);
 }
 
 static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
index adfdad2..b673200 100644 (file)
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1863,10 +1863,7 @@ static const struct bpf_func_proto bpf_sk_fullsock_proto = {
 static inline int sk_skb_try_make_writable(struct sk_buff *skb,
                                           unsigned int write_len)
 {
-       int err = __bpf_try_make_writable(skb, write_len);
-
-       bpf_compute_data_end_sk_skb(skb);
-       return err;
+       return __bpf_try_make_writable(skb, write_len);
 }
 
 BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
@@ -3412,6 +3409,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
                                         BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
                                         BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
                                         BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
+                                        BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
                                         BPF_F_ADJ_ROOM_ENCAP_L2( \
                                          BPF_ADJ_ROOM_ENCAP_L2_MASK))
 
@@ -3448,6 +3446,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
                    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
                        return -EINVAL;
 
+               if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
+                   inner_mac_len < ETH_HLEN)
+                       return -EINVAL;
+
                if (skb->encapsulation)
                        return -EALREADY;
 
@@ -3466,7 +3468,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
                skb->inner_mac_header = inner_net - inner_mac_len;
                skb->inner_network_header = inner_net;
                skb->inner_transport_header = inner_trans;
-               skb_set_inner_protocol(skb, skb->protocol);
+
+               if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
+                       skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+               else
+                       skb_set_inner_protocol(skb, skb->protocol);
 
                skb->encapsulation = 1;
                skb_set_network_header(skb, mac_len);
@@ -3577,7 +3583,6 @@ BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
                        return -ENOMEM;
                __skb_pull(skb, len_diff_abs);
        }
-       bpf_compute_data_end_sk_skb(skb);
        if (tls_sw_has_ctx_rx(skb->sk)) {
                struct strp_msg *rxm = strp_msg(skb);
 
@@ -3742,10 +3747,7 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
 BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
           u64, flags)
 {
-       int ret = __bpf_skb_change_tail(skb, new_len, flags);
-
-       bpf_compute_data_end_sk_skb(skb);
-       return ret;
+       return __bpf_skb_change_tail(skb, new_len, flags);
 }
 
 static const struct bpf_func_proto sk_skb_change_tail_proto = {
@@ -3808,10 +3810,7 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
 BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
           u64, flags)
 {
-       int ret = __bpf_skb_change_head(skb, head_room, flags);
-
-       bpf_compute_data_end_sk_skb(skb);
-       return ret;
+       return __bpf_skb_change_head(skb, head_room, flags);
 }
 
 static const struct bpf_func_proto sk_skb_change_head_proto = {
@@ -3919,23 +3918,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
        .arg2_type      = ARG_ANYTHING,
 };
 
-static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
-                           struct bpf_map *map, struct xdp_buff *xdp)
-{
-       switch (map->map_type) {
-       case BPF_MAP_TYPE_DEVMAP:
-       case BPF_MAP_TYPE_DEVMAP_HASH:
-               return dev_map_enqueue(fwd, xdp, dev_rx);
-       case BPF_MAP_TYPE_CPUMAP:
-               return cpu_map_enqueue(fwd, xdp, dev_rx);
-       case BPF_MAP_TYPE_XSKMAP:
-               return __xsk_map_redirect(fwd, xdp);
-       default:
-               return -EBADRQC;
-       }
-       return 0;
-}
-
 void xdp_do_flush(void)
 {
        __dev_flush();
@@ -3944,71 +3926,52 @@ void xdp_do_flush(void)
 }
 EXPORT_SYMBOL_GPL(xdp_do_flush);
 
-static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
-{
-       switch (map->map_type) {
-       case BPF_MAP_TYPE_DEVMAP:
-               return __dev_map_lookup_elem(map, index);
-       case BPF_MAP_TYPE_DEVMAP_HASH:
-               return __dev_map_hash_lookup_elem(map, index);
-       case BPF_MAP_TYPE_CPUMAP:
-               return __cpu_map_lookup_elem(map, index);
-       case BPF_MAP_TYPE_XSKMAP:
-               return __xsk_map_lookup_elem(map, index);
-       default:
-               return NULL;
-       }
-}
-
-void bpf_clear_redirect_map(struct bpf_map *map)
-{
-       struct bpf_redirect_info *ri;
-       int cpu;
-
-       for_each_possible_cpu(cpu) {
-               ri = per_cpu_ptr(&bpf_redirect_info, cpu);
-               /* Avoid polluting remote cacheline due to writes if
-                * not needed. Once we pass this test, we need the
-                * cmpxchg() to make sure it hasn't been changed in
-                * the meantime by remote CPU.
-                */
-               if (unlikely(READ_ONCE(ri->map) == map))
-                       cmpxchg(&ri->map, map, NULL);
-       }
-}
-
 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
                    struct bpf_prog *xdp_prog)
 {
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-       struct bpf_map *map = READ_ONCE(ri->map);
-       u32 index = ri->tgt_index;
+       enum bpf_map_type map_type = ri->map_type;
        void *fwd = ri->tgt_value;
+       u32 map_id = ri->map_id;
        int err;
 
-       ri->tgt_index = 0;
-       ri->tgt_value = NULL;
-       WRITE_ONCE(ri->map, NULL);
+       ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+       ri->map_type = BPF_MAP_TYPE_UNSPEC;
 
-       if (unlikely(!map)) {
-               fwd = dev_get_by_index_rcu(dev_net(dev), index);
-               if (unlikely(!fwd)) {
-                       err = -EINVAL;
-                       goto err;
+       switch (map_type) {
+       case BPF_MAP_TYPE_DEVMAP:
+               fallthrough;
+       case BPF_MAP_TYPE_DEVMAP_HASH:
+               err = dev_map_enqueue(fwd, xdp, dev);
+               break;
+       case BPF_MAP_TYPE_CPUMAP:
+               err = cpu_map_enqueue(fwd, xdp, dev);
+               break;
+       case BPF_MAP_TYPE_XSKMAP:
+               err = __xsk_map_redirect(fwd, xdp);
+               break;
+       case BPF_MAP_TYPE_UNSPEC:
+               if (map_id == INT_MAX) {
+                       fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+                       if (unlikely(!fwd)) {
+                               err = -EINVAL;
+                               break;
+                       }
+                       err = dev_xdp_enqueue(fwd, xdp, dev);
+                       break;
                }
-
-               err = dev_xdp_enqueue(fwd, xdp, dev);
-       } else {
-               err = __bpf_tx_xdp_map(dev, fwd, map, xdp);
+               fallthrough;
+       default:
+               err = -EBADRQC;
        }
 
        if (unlikely(err))
                goto err;
 
-       _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+       _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
        return 0;
 err:
-       _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+       _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
        return err;
 }
 EXPORT_SYMBOL_GPL(xdp_do_redirect);
@@ -4017,41 +3980,36 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
                                       struct sk_buff *skb,
                                       struct xdp_buff *xdp,
                                       struct bpf_prog *xdp_prog,
-                                      struct bpf_map *map)
+                                      void *fwd,
+                                      enum bpf_map_type map_type, u32 map_id)
 {
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-       u32 index = ri->tgt_index;
-       void *fwd = ri->tgt_value;
-       int err = 0;
-
-       ri->tgt_index = 0;
-       ri->tgt_value = NULL;
-       WRITE_ONCE(ri->map, NULL);
-
-       if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
-           map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
-               struct bpf_dtab_netdev *dst = fwd;
+       int err;
 
-               err = dev_map_generic_redirect(dst, skb, xdp_prog);
+       switch (map_type) {
+       case BPF_MAP_TYPE_DEVMAP:
+               fallthrough;
+       case BPF_MAP_TYPE_DEVMAP_HASH:
+               err = dev_map_generic_redirect(fwd, skb, xdp_prog);
                if (unlikely(err))
                        goto err;
-       } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
-               struct xdp_sock *xs = fwd;
-
-               err = xsk_generic_rcv(xs, xdp);
+               break;
+       case BPF_MAP_TYPE_XSKMAP:
+               err = xsk_generic_rcv(fwd, xdp);
                if (err)
                        goto err;
                consume_skb(skb);
-       } else {
+               break;
+       default:
                /* TODO: Handle BPF_MAP_TYPE_CPUMAP */
                err = -EBADRQC;
                goto err;
        }
 
-       _trace_xdp_redirect_map(dev, xdp_prog, fwd, mapindex);
+       _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
        return 0;
 err:
-       _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, mapindex, err);
+       _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
        return err;
 }
 
@@ -4059,31 +4017,34 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
                            struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
 {
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-       struct bpf_map *map = READ_ONCE(ri->map);
-       u32 index = ri->tgt_index;
-       struct net_device *fwd;
-       int err = 0;
-
-       if (map)
-               return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
-                                                  map);
-       ri->tgt_index = 0;
-       fwd = dev_get_by_index_rcu(dev_net(dev), index);
-       if (unlikely(!fwd)) {
-               err = -EINVAL;
-               goto err;
-       }
+       enum bpf_map_type map_type = ri->map_type;
+       void *fwd = ri->tgt_value;
+       u32 map_id = ri->map_id;
+       int err;
 
-       err = xdp_ok_fwd_dev(fwd, skb->len);
-       if (unlikely(err))
-               goto err;
+       ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+       ri->map_type = BPF_MAP_TYPE_UNSPEC;
 
-       skb->dev = fwd;
-       _trace_xdp_redirect(dev, xdp_prog, index);
-       generic_xdp_tx(skb, xdp_prog);
-       return 0;
+       if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
+               fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+               if (unlikely(!fwd)) {
+                       err = -EINVAL;
+                       goto err;
+               }
+
+               err = xdp_ok_fwd_dev(fwd, skb->len);
+               if (unlikely(err))
+                       goto err;
+
+               skb->dev = fwd;
+               _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index);
+               generic_xdp_tx(skb, xdp_prog);
+               return 0;
+       }
+
+       return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id);
 err:
-       _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+       _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
        return err;
 }
 
@@ -4094,10 +4055,12 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
        if (unlikely(flags))
                return XDP_ABORTED;
 
-       ri->flags = flags;
+       /* NB! Map type UNSPEC and map_id == INT_MAX (never generated
+        * by map_idr) are used for ifindex-based XDP redirect.
+        */
        ri->tgt_index = ifindex;
-       ri->tgt_value = NULL;
-       WRITE_ONCE(ri->map, NULL);
+       ri->map_id = INT_MAX;
+       ri->map_type = BPF_MAP_TYPE_UNSPEC;
 
        return XDP_REDIRECT;
 }
@@ -4113,28 +4076,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = {
 BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
           u64, flags)
 {
-       struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-
-       /* Lower bits of the flags are used as return code on lookup failure */
-       if (unlikely(flags > XDP_TX))
-               return XDP_ABORTED;
-
-       ri->tgt_value = __xdp_map_lookup_elem(map, ifindex);
-       if (unlikely(!ri->tgt_value)) {
-               /* If the lookup fails we want to clear out the state in the
-                * redirect_info struct completely, so that if an eBPF program
-                * performs multiple lookups, the last one always takes
-                * precedence.
-                */
-               WRITE_ONCE(ri->map, NULL);
-               return flags;
-       }
-
-       ri->flags = flags;
-       ri->tgt_index = ifindex;
-       WRITE_ONCE(ri->map, map);
-
-       return XDP_REDIRECT;
+       return map->ops->map_redirect(map, ifindex, flags);
 }
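(bpf_xdp_redirect_map() is reduced to a dispatch through the new
map_redirect op; each map type supplies an implementation that funnels into
a shared inline helper this series adds to include/linux/filter.h. A
simplified sketch of that helper's shape:)

	static __always_inline int
	__bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags,
			       void *lookup_elem(struct bpf_map *map, u32 key))
	{
		struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);

		/* Lower bits of the flags are used as return code on
		 * lookup failure. */
		if (unlikely(flags > XDP_TX))
			return XDP_ABORTED;

		ri->tgt_value = lookup_elem(map, ifindex);
		if (unlikely(!ri->tgt_value)) {
			/* Clear the state so that when a program performs
			 * multiple lookups, the last one takes precedence. */
			ri->map_id = INT_MAX;
			ri->map_type = BPF_MAP_TYPE_UNSPEC;
			return flags;
		}

		ri->tgt_index = ifindex;
		ri->map_id = map->id;
		ri->map_type = map->map_type;

		return XDP_REDIRECT;
	}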
 
 static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
@@ -9655,22 +9597,40 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
        return insn - insn_buf;
 }
 
+/* data_end = skb->data + skb_headlen() */
+static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
+                                                   struct bpf_insn *insn)
+{
+       /* si->dst_reg = skb->data */
+       *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+                             si->dst_reg, si->src_reg,
+                             offsetof(struct sk_buff, data));
+       /* AX = skb->len */
+       *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
+                             BPF_REG_AX, si->src_reg,
+                             offsetof(struct sk_buff, len));
+       /* si->dst_reg = skb->data + skb->len */
+       *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
+       /* AX = skb->data_len */
+       *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
+                             BPF_REG_AX, si->src_reg,
+                             offsetof(struct sk_buff, data_len));
+       /* si->dst_reg = skb->data + skb->len - skb->data_len */
+       *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX);
+
+       return insn;
+}
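(The instruction sequence above open-codes skb_headlen(); the equivalent C,
for reference:)

	/* data_end points at the end of the linear area only */
	void *data_end = skb->data + (skb->len - skb->data_len);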
+
 static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
                                     const struct bpf_insn *si,
                                     struct bpf_insn *insn_buf,
                                     struct bpf_prog *prog, u32 *target_size)
 {
        struct bpf_insn *insn = insn_buf;
-       int off;
 
        switch (si->off) {
        case offsetof(struct __sk_buff, data_end):
-               off  = si->off;
-               off -= offsetof(struct __sk_buff, data_end);
-               off += offsetof(struct sk_buff, cb);
-               off += offsetof(struct tcp_skb_cb, bpf.data_end);
-               *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
-                                     si->src_reg, off);
+               insn = bpf_convert_data_end_access(si, insn);
                break;
        default:
                return bpf_convert_ctx_access(type, si, insn_buf, prog,
@@ -10449,6 +10409,7 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
 }
 
 const struct bpf_prog_ops sk_lookup_prog_ops = {
+       .test_run = bpf_prog_test_run_sk_lookup,
 };
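(Hooking .test_run means BPF_PROG_TEST_RUN can now drive sk_lookup programs
against a synthetic context. A hypothetical user-space sketch with libbpf;
the context field values are made up:)

	struct bpf_sk_lookup ctx = {
		.family     = AF_INET,
		.protocol   = IPPROTO_TCP,
		.local_ip4  = htonl(INADDR_LOOPBACK),
		.local_port = 8080,
	};
	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
		.ctx_in = &ctx,
		.ctx_size_in = sizeof(ctx),
		.repeat = 1,
	);
	int err = bpf_prog_test_run_opts(prog_fd, &opts);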
 
 const struct bpf_verifier_ops sk_lookup_verifier_ops = {
index 1261512..07f5401 100644 (file)
@@ -525,7 +525,8 @@ static void sk_psock_backlog(struct work_struct *work)
                len = skb->len;
                off = 0;
 start:
-               ingress = tcp_skb_bpf_ingress(skb);
+               ingress = skb_bpf_ingress(skb);
+               skb_bpf_redirect_clear(skb);
                do {
                        ret = -EIO;
                        if (likely(psock->sk->sk_socket))
@@ -618,7 +619,7 @@ struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
        return link;
 }
 
-void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
+static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
 {
        struct sk_msg *msg, *tmp;
 
@@ -631,7 +632,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
 
 static void sk_psock_zap_ingress(struct sk_psock *psock)
 {
-       __skb_queue_purge(&psock->ingress_skb);
+       struct sk_buff *skb;
+
+       while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) {
+               skb_bpf_redirect_clear(skb);
+               kfree_skb(skb);
+       }
        __sk_psock_purge_ingress_msg(psock);
 }
 
@@ -645,15 +651,15 @@ static void sk_psock_link_destroy(struct sk_psock *psock)
        }
 }
 
+static void sk_psock_done_strp(struct sk_psock *psock);
+
 static void sk_psock_destroy_deferred(struct work_struct *gc)
 {
        struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
 
        /* No sk_callback_lock since already detached. */
 
-       /* Parser has been stopped */
-       if (psock->progs.skb_parser)
-               strp_done(&psock->parser.strp);
+       sk_psock_done_strp(psock);
 
        cancel_work_sync(&psock->work);
 
@@ -685,9 +691,9 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
        write_lock_bh(&sk->sk_callback_lock);
        sk_psock_restore_proto(sk, psock);
        rcu_assign_sk_user_data(sk, NULL);
-       if (psock->progs.skb_parser)
+       if (psock->progs.stream_parser)
                sk_psock_stop_strp(sk, psock);
-       else if (psock->progs.skb_verdict)
+       else if (psock->progs.stream_verdict)
                sk_psock_stop_verdict(sk, psock);
        write_unlock_bh(&sk->sk_callback_lock);
        sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
@@ -743,27 +749,12 @@ out:
 }
 EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
 
-static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
-                           struct sk_buff *skb)
-{
-       bpf_compute_data_end_sk_skb(skb);
-       return bpf_prog_run_pin_on_cpu(prog, skb);
-}
-
-static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
-{
-       struct sk_psock_parser *parser;
-
-       parser = container_of(strp, struct sk_psock_parser, strp);
-       return container_of(parser, struct sk_psock, parser);
-}
-
 static void sk_psock_skb_redirect(struct sk_buff *skb)
 {
        struct sk_psock *psock_other;
        struct sock *sk_other;
 
-       sk_other = tcp_skb_bpf_redirect_fetch(skb);
+       sk_other = skb_bpf_redirect_fetch(skb);
        /* This error is a buggy BPF program, it returned a redirect
         * return code, but then didn't set a redirect interface.
         */
@@ -806,16 +797,17 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
        int ret = __SK_PASS;
 
        rcu_read_lock();
-       prog = READ_ONCE(psock->progs.skb_verdict);
+       prog = READ_ONCE(psock->progs.stream_verdict);
        if (likely(prog)) {
                /* We skip full set_owner_r here because if we do a SK_PASS
                 * or SK_DROP we can skip skb memory accounting and use the
                 * TLS context.
                 */
                skb->sk = psock->sk;
-               tcp_skb_bpf_redirect_clear(skb);
-               ret = sk_psock_bpf_run(psock, prog, skb);
-               ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+               skb_dst_drop(skb);
+               skb_bpf_redirect_clear(skb);
+               ret = bpf_prog_run_pin_on_cpu(prog, skb);
+               ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
                skb->sk = NULL;
        }
        sk_psock_tls_verdict_apply(skb, psock->sk, ret);
@@ -827,7 +819,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
 static void sk_psock_verdict_apply(struct sk_psock *psock,
                                   struct sk_buff *skb, int verdict)
 {
-       struct tcp_skb_cb *tcp;
        struct sock *sk_other;
        int err = -EIO;
 
@@ -839,8 +830,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
                        goto out_free;
                }
 
-               tcp = TCP_SKB_CB(skb);
-               tcp->bpf.flags |= BPF_F_INGRESS;
+               skb_bpf_set_ingress(skb);
 
                /* If the queue is empty then we can submit directly
                 * into the msg queue. If its not empty we have to
@@ -866,6 +856,24 @@ out_free:
        }
 }
 
+static void sk_psock_write_space(struct sock *sk)
+{
+       struct sk_psock *psock;
+       void (*write_space)(struct sock *sk) = NULL;
+
+       rcu_read_lock();
+       psock = sk_psock(sk);
+       if (likely(psock)) {
+               if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+                       schedule_work(&psock->work);
+               write_space = psock->saved_write_space;
+       }
+       rcu_read_unlock();
+       if (write_space)
+               write_space(sk);
+}
+
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
 static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
 {
        struct sk_psock *psock;
@@ -881,11 +889,12 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
                goto out;
        }
        skb_set_owner_r(skb, sk);
-       prog = READ_ONCE(psock->progs.skb_verdict);
+       prog = READ_ONCE(psock->progs.stream_verdict);
        if (likely(prog)) {
-               tcp_skb_bpf_redirect_clear(skb);
-               ret = sk_psock_bpf_run(psock, prog, skb);
-               ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+               skb_dst_drop(skb);
+               skb_bpf_redirect_clear(skb);
+               ret = bpf_prog_run_pin_on_cpu(prog, skb);
+               ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
        }
        sk_psock_verdict_apply(psock, skb, ret);
 out:
@@ -899,15 +908,15 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err)
 
 static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
 {
-       struct sk_psock *psock = sk_psock_from_strp(strp);
+       struct sk_psock *psock = container_of(strp, struct sk_psock, strp);
        struct bpf_prog *prog;
        int ret = skb->len;
 
        rcu_read_lock();
-       prog = READ_ONCE(psock->progs.skb_parser);
+       prog = READ_ONCE(psock->progs.stream_parser);
        if (likely(prog)) {
                skb->sk = psock->sk;
-               ret = sk_psock_bpf_run(psock, prog, skb);
+               ret = bpf_prog_run_pin_on_cpu(prog, skb);
                skb->sk = NULL;
        }
        rcu_read_unlock();
@@ -923,16 +932,59 @@ static void sk_psock_strp_data_ready(struct sock *sk)
        psock = sk_psock(sk);
        if (likely(psock)) {
                if (tls_sw_has_ctx_rx(sk)) {
-                       psock->parser.saved_data_ready(sk);
+                       psock->saved_data_ready(sk);
                } else {
                        write_lock_bh(&sk->sk_callback_lock);
-                       strp_data_ready(&psock->parser.strp);
+                       strp_data_ready(&psock->strp);
                        write_unlock_bh(&sk->sk_callback_lock);
                }
        }
        rcu_read_unlock();
 }
 
+int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
+{
+       static const struct strp_callbacks cb = {
+               .rcv_msg        = sk_psock_strp_read,
+               .read_sock_done = sk_psock_strp_read_done,
+               .parse_msg      = sk_psock_strp_parse,
+       };
+
+       return strp_init(&psock->strp, sk, &cb);
+}
+
+void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
+{
+       if (psock->saved_data_ready)
+               return;
+
+       psock->saved_data_ready = sk->sk_data_ready;
+       sk->sk_data_ready = sk_psock_strp_data_ready;
+       sk->sk_write_space = sk_psock_write_space;
+}
+
+void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
+{
+       if (!psock->saved_data_ready)
+               return;
+
+       sk->sk_data_ready = psock->saved_data_ready;
+       psock->saved_data_ready = NULL;
+       strp_stop(&psock->strp);
+}
+
+static void sk_psock_done_strp(struct sk_psock *psock)
+{
+       /* Parser has been stopped */
+       if (psock->progs.stream_parser)
+               strp_done(&psock->strp);
+}
+#else
+static void sk_psock_done_strp(struct sk_psock *psock)
+{
+}
+#endif /* CONFIG_BPF_STREAM_PARSER */
+
 static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
                                 unsigned int offset, size_t orig_len)
 {
@@ -957,11 +1009,12 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
                goto out;
        }
        skb_set_owner_r(skb, sk);
-       prog = READ_ONCE(psock->progs.skb_verdict);
+       prog = READ_ONCE(psock->progs.stream_verdict);
        if (likely(prog)) {
-               tcp_skb_bpf_redirect_clear(skb);
-               ret = sk_psock_bpf_run(psock, prog, skb);
-               ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+               skb_dst_drop(skb);
+               skb_bpf_redirect_clear(skb);
+               ret = bpf_prog_run_pin_on_cpu(prog, skb);
+               ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
        }
        sk_psock_verdict_apply(psock, skb, ret);
 out:
@@ -984,82 +1037,21 @@ static void sk_psock_verdict_data_ready(struct sock *sk)
        sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv);
 }
 
-static void sk_psock_write_space(struct sock *sk)
-{
-       struct sk_psock *psock;
-       void (*write_space)(struct sock *sk) = NULL;
-
-       rcu_read_lock();
-       psock = sk_psock(sk);
-       if (likely(psock)) {
-               if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
-                       schedule_work(&psock->work);
-               write_space = psock->saved_write_space;
-       }
-       rcu_read_unlock();
-       if (write_space)
-               write_space(sk);
-}
-
-int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
-{
-       static const struct strp_callbacks cb = {
-               .rcv_msg        = sk_psock_strp_read,
-               .read_sock_done = sk_psock_strp_read_done,
-               .parse_msg      = sk_psock_strp_parse,
-       };
-
-       psock->parser.enabled = false;
-       return strp_init(&psock->parser.strp, sk, &cb);
-}
-
 void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
 {
-       struct sk_psock_parser *parser = &psock->parser;
-
-       if (parser->enabled)
+       if (psock->saved_data_ready)
                return;
 
-       parser->saved_data_ready = sk->sk_data_ready;
+       psock->saved_data_ready = sk->sk_data_ready;
        sk->sk_data_ready = sk_psock_verdict_data_ready;
        sk->sk_write_space = sk_psock_write_space;
-       parser->enabled = true;
-}
-
-void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
-{
-       struct sk_psock_parser *parser = &psock->parser;
-
-       if (parser->enabled)
-               return;
-
-       parser->saved_data_ready = sk->sk_data_ready;
-       sk->sk_data_ready = sk_psock_strp_data_ready;
-       sk->sk_write_space = sk_psock_write_space;
-       parser->enabled = true;
-}
-
-void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
-{
-       struct sk_psock_parser *parser = &psock->parser;
-
-       if (!parser->enabled)
-               return;
-
-       sk->sk_data_ready = parser->saved_data_ready;
-       parser->saved_data_ready = NULL;
-       strp_stop(&parser->strp);
-       parser->enabled = false;
 }
 
 void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
 {
-       struct sk_psock_parser *parser = &psock->parser;
-
-       if (!parser->enabled)
+       if (!psock->saved_data_ready)
                return;
 
-       sk->sk_data_ready = parser->saved_data_ready;
-       parser->saved_data_ready = NULL;
-       parser->enabled = false;
+       sk->sk_data_ready = psock->saved_data_ready;
+       psock->saved_data_ready = NULL;
 }
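(The tcp_skb_bpf_*() accessors used above are replaced throughout by
generic skb_bpf_*() helpers that stash the redirect target in the skb
itself rather than in TCP_SKB_CB, so non-TCP sockets can share the path.
Roughly, as added to include/linux/skmsg.h elsewhere in this series:)

	static inline void skb_bpf_set_redir(struct sk_buff *skb,
					     struct sock *sk_redir, bool ingress)
	{
		skb->_sk_redir = (unsigned long)sk_redir;
		if (ingress)
			skb->_sk_redir |= BPF_F_INGRESS;
	}

	static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb)
	{
		/* Mask off the flag bit to recover the pointer. */
		return (struct sock *)(skb->_sk_redir & ~BPF_F_INGRESS);
	}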
index d758fb8..dd53a77 100644 (file)
@@ -24,6 +24,9 @@ struct bpf_stab {
 #define SOCK_CREATE_FLAG_MASK                          \
        (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+                               struct bpf_prog *old, u32 which);
+
 static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 {
        struct bpf_stab *stab;
@@ -148,9 +151,9 @@ static void sock_map_del_link(struct sock *sk,
                        struct bpf_map *map = link->map;
                        struct bpf_stab *stab = container_of(map, struct bpf_stab,
                                                             map);
-                       if (psock->parser.enabled && stab->progs.skb_parser)
+                       if (psock->saved_data_ready && stab->progs.stream_parser)
                                strp_stop = true;
-                       if (psock->parser.enabled && stab->progs.skb_verdict)
+                       if (psock->saved_data_ready && stab->progs.stream_verdict)
                                verdict_stop = true;
                        list_del(&link->list);
                        sk_psock_free_link(link);
@@ -224,23 +227,23 @@ out:
 static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
                         struct sock *sk)
 {
-       struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
+       struct bpf_prog *msg_parser, *stream_parser, *stream_verdict;
        struct sk_psock *psock;
        int ret;
 
-       skb_verdict = READ_ONCE(progs->skb_verdict);
-       if (skb_verdict) {
-               skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
-               if (IS_ERR(skb_verdict))
-                       return PTR_ERR(skb_verdict);
+       stream_verdict = READ_ONCE(progs->stream_verdict);
+       if (stream_verdict) {
+               stream_verdict = bpf_prog_inc_not_zero(stream_verdict);
+               if (IS_ERR(stream_verdict))
+                       return PTR_ERR(stream_verdict);
        }
 
-       skb_parser = READ_ONCE(progs->skb_parser);
-       if (skb_parser) {
-               skb_parser = bpf_prog_inc_not_zero(skb_parser);
-               if (IS_ERR(skb_parser)) {
-                       ret = PTR_ERR(skb_parser);
-                       goto out_put_skb_verdict;
+       stream_parser = READ_ONCE(progs->stream_parser);
+       if (stream_parser) {
+               stream_parser = bpf_prog_inc_not_zero(stream_parser);
+               if (IS_ERR(stream_parser)) {
+                       ret = PTR_ERR(stream_parser);
+                       goto out_put_stream_verdict;
                }
        }
 
@@ -249,7 +252,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
                msg_parser = bpf_prog_inc_not_zero(msg_parser);
                if (IS_ERR(msg_parser)) {
                        ret = PTR_ERR(msg_parser);
-                       goto out_put_skb_parser;
+                       goto out_put_stream_parser;
                }
        }
 
@@ -261,8 +264,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
 
        if (psock) {
                if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
-                   (skb_parser  && READ_ONCE(psock->progs.skb_parser)) ||
-                   (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) {
+                   (stream_parser  && READ_ONCE(psock->progs.stream_parser)) ||
+                   (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) {
                        sk_psock_put(sk, psock);
                        ret = -EBUSY;
                        goto out_progs;
@@ -283,15 +286,15 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
                goto out_drop;
 
        write_lock_bh(&sk->sk_callback_lock);
-       if (skb_parser && skb_verdict && !psock->parser.enabled) {
+       if (stream_parser && stream_verdict && !psock->saved_data_ready) {
                ret = sk_psock_init_strp(sk, psock);
                if (ret)
                        goto out_unlock_drop;
-               psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
-               psock_set_prog(&psock->progs.skb_parser, skb_parser);
+               psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
+               psock_set_prog(&psock->progs.stream_parser, stream_parser);
                sk_psock_start_strp(sk, psock);
-       } else if (!skb_parser && skb_verdict && !psock->parser.enabled) {
-               psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+       } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) {
+               psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
+               sk_psock_start_verdict(sk, psock);
        }
        write_unlock_bh(&sk->sk_callback_lock);
@@ -303,12 +306,12 @@ out_drop:
 out_progs:
        if (msg_parser)
                bpf_prog_put(msg_parser);
-out_put_skb_parser:
-       if (skb_parser)
-               bpf_prog_put(skb_parser);
-out_put_skb_verdict:
-       if (skb_verdict)
-               bpf_prog_put(skb_verdict);
+out_put_stream_parser:
+       if (stream_parser)
+               bpf_prog_put(stream_parser);
+out_put_stream_verdict:
+       if (stream_verdict)
+               bpf_prog_put(stream_verdict);
        return ret;
 }
 
@@ -657,7 +660,6 @@ const struct bpf_func_proto bpf_sock_map_update_proto = {
 BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
           struct bpf_map *, map, u32, key, u64, flags)
 {
-       struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
        struct sock *sk;
 
        if (unlikely(flags & ~(BPF_F_INGRESS)))
@@ -667,8 +669,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
        if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
                return SK_DROP;
 
-       tcb->bpf.flags = flags;
-       tcb->bpf.sk_redir = sk;
+       skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
        return SK_PASS;
 }
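(On the BPF side the helper contract is unchanged; only the kernel-side
bookkeeping moved. A sketch of a stream verdict program using it, with the
map name and key illustrative:)

	struct {
		__uint(type, BPF_MAP_TYPE_SOCKMAP);
		__uint(max_entries, 2);
		__type(key, __u32);
		__type(value, __u64);
	} sock_map SEC(".maps");

	SEC("sk_skb/stream_verdict")
	int verdict_prog(struct __sk_buff *skb)
	{
		__u32 key = 0;

		/* Returns SK_PASS on success, SK_DROP otherwise. */
		return bpf_sk_redirect_map(skb, &sock_map, key, BPF_F_INGRESS);
	}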
 
@@ -1250,7 +1251,6 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = {
 BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
           struct bpf_map *, map, void *, key, u64, flags)
 {
-       struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
        struct sock *sk;
 
        if (unlikely(flags & ~(BPF_F_INGRESS)))
@@ -1260,8 +1260,7 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
        if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
                return SK_DROP;
 
-       tcb->bpf.flags = flags;
-       tcb->bpf.sk_redir = sk;
+       skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
        return SK_PASS;
 }
 
@@ -1448,8 +1447,8 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
        return NULL;
 }
 
-int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
-                        struct bpf_prog *old, u32 which)
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+                               struct bpf_prog *old, u32 which)
 {
        struct sk_psock_progs *progs = sock_map_progs(map);
        struct bpf_prog **pprog;
@@ -1461,11 +1460,13 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
        case BPF_SK_MSG_VERDICT:
                pprog = &progs->msg_parser;
                break;
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
        case BPF_SK_SKB_STREAM_PARSER:
-               pprog = &progs->skb_parser;
+               pprog = &progs->stream_parser;
                break;
+#endif
        case BPF_SK_SKB_STREAM_VERDICT:
-               pprog = &progs->skb_verdict;
+               pprog = &progs->stream_verdict;
                break;
        default:
                return -EOPNOTSUPP;
index 5b77a46..bbdd9c4 100644 (file)
@@ -62,7 +62,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
 obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
 obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
-obj-$(CONFIG_BPF_STREAM_PARSER) += udp_bpf.o
+obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
index bc7d2a5..17c322b 100644 (file)
@@ -229,7 +229,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
 }
 EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
 
-#ifdef CONFIG_BPF_STREAM_PARSER
+#ifdef CONFIG_BPF_SYSCALL
 static bool tcp_bpf_stream_read(const struct sock *sk)
 {
        struct sk_psock *psock;
@@ -629,4 +629,4 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
        if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE])
                newsk->sk_prot = sk->sk_prot_creator;
 }
-#endif /* CONFIG_BPF_STREAM_PARSER */
+#endif /* CONFIG_BPF_SYSCALL */
index 4faabd1..a71ed66 100644 (file)
@@ -445,6 +445,97 @@ static void xsk_destruct_skb(struct sk_buff *skb)
        sock_wfree(skb);
 }
 
+static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
+                                             struct xdp_desc *desc)
+{
+       struct xsk_buff_pool *pool = xs->pool;
+       u32 hr, len, ts, offset, copy, copied;
+       struct sk_buff *skb;
+       struct page *page;
+       void *buffer;
+       int err, i;
+       u64 addr;
+
+       hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
+
+       skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
+       if (unlikely(!skb))
+               return ERR_PTR(err);
+
+       skb_reserve(skb, hr);
+
+       addr = desc->addr;
+       len = desc->len;
+       ts = pool->unaligned ? len : pool->chunk_size;
+
+       buffer = xsk_buff_raw_get_data(pool, addr);
+       offset = offset_in_page(buffer);
+       addr = buffer - pool->addrs;
+
+       for (copied = 0, i = 0; copied < len; i++) {
+               page = pool->umem->pgs[addr >> PAGE_SHIFT];
+               get_page(page);
+
+               copy = min_t(u32, PAGE_SIZE - offset, len - copied);
+               skb_fill_page_desc(skb, i, page, offset, copy);
+
+               copied += copy;
+               addr += copy;
+               offset = 0;
+       }
+
+       skb->len += len;
+       skb->data_len += len;
+       skb->truesize += ts;
+
+       refcount_add(ts, &xs->sk.sk_wmem_alloc);
+
+       return skb;
+}
+
+static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
+                                    struct xdp_desc *desc)
+{
+       struct net_device *dev = xs->dev;
+       struct sk_buff *skb;
+
+       if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
+               skb = xsk_build_skb_zerocopy(xs, desc);
+               if (IS_ERR(skb))
+                       return skb;
+       } else {
+               u32 hr, tr, len;
+               void *buffer;
+               int err;
+
+               hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
+               tr = dev->needed_tailroom;
+               len = desc->len;
+
+               skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
+               if (unlikely(!skb))
+                       return ERR_PTR(err);
+
+               skb_reserve(skb, hr);
+               skb_put(skb, len);
+
+               buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
+               err = skb_store_bits(skb, 0, buffer, len);
+               if (unlikely(err)) {
+                       kfree_skb(skb);
+                       return ERR_PTR(err);
+               }
+       }
+
+       skb->dev = dev;
+       skb->priority = xs->sk.sk_priority;
+       skb->mark = xs->sk.sk_mark;
+       skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
+       skb->destructor = xsk_destruct_skb;
+
+       return skb;
+}
+
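(The zerocopy path above attaches the descriptor's umem pages as skb frags
instead of copying into a linear buffer, and is only taken for devices that
advertise support. A driver opts in with the flag below, as virtio_net does
elsewhere in this series:)

	dev->priv_flags |= IFF_TX_SKB_NO_LINEAR;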
 static int xsk_generic_xmit(struct sock *sk)
 {
        struct xdp_sock *xs = xdp_sk(sk);
@@ -461,43 +552,30 @@ static int xsk_generic_xmit(struct sock *sk)
                goto out;
 
        while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
-               char *buffer;
-               u64 addr;
-               u32 len;
-
                if (max_batch-- == 0) {
                        err = -EAGAIN;
                        goto out;
                }
 
-               len = desc.len;
-               skb = sock_alloc_send_skb(sk, len, 1, &err);
-               if (unlikely(!skb))
+               skb = xsk_build_skb(xs, &desc);
+               if (IS_ERR(skb)) {
+                       err = PTR_ERR(skb);
                        goto out;
+               }
 
-               skb_put(skb, len);
-               addr = desc.addr;
-               buffer = xsk_buff_raw_get_data(xs->pool, addr);
-               err = skb_store_bits(skb, 0, buffer, len);
                /* This is the backpressure mechanism for the Tx path.
                 * Reserve space in the completion queue and only proceed
                 * if there is space in it. This avoids having to implement
                 * any buffering in the Tx path.
                 */
                spin_lock_irqsave(&xs->pool->cq_lock, flags);
-               if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
+               if (xskq_prod_reserve(xs->pool->cq)) {
                        spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
                        kfree_skb(skb);
                        goto out;
                }
                spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 
-               skb->dev = xs->dev;
-               skb->priority = sk->sk_priority;
-               skb->mark = sk->sk_mark;
-               skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
-               skb->destructor = xsk_destruct_skb;
-
                err = __dev_direct_xmit(skb, xs->queue_id);
                if  (err == NETDEV_TX_BUSY) {
                        /* Tell user-space to retry the send */
index 2823b7c..2ac3802 100644 (file)
@@ -47,19 +47,18 @@ struct xsk_queue {
        u64 queue_empty_descs;
 };
 
-/* The structure of the shared state of the rings are the same as the
- * ring buffer in kernel/events/ring_buffer.c. For the Rx and completion
- * ring, the kernel is the producer and user space is the consumer. For
- * the Tx and fill rings, the kernel is the consumer and user space is
- * the producer.
+/* The structure of the shared state of the rings is a simple
+ * circular buffer, as outlined in
+ * Documentation/core-api/circular-buffers.rst. For the Rx and
+ * completion ring, the kernel is the producer and user space is the
+ * consumer. For the Tx and fill rings, the kernel is the consumer and
+ * user space is the producer.
  *
  * producer                         consumer
  *
- * if (LOAD ->consumer) {           LOAD ->producer
- *                    (A)           smp_rmb()       (C)
+ * if (LOAD ->consumer) {  (A)      LOAD.acq ->producer  (C)
  *    STORE $data                   LOAD $data
- *    smp_wmb()       (B)           smp_mb()        (D)
- *    STORE ->producer              STORE ->consumer
+ *    STORE.rel ->producer (B)      STORE.rel ->consumer (D)
  * }
  *
  * (A) pairs with (D), and (B) pairs with (C).
@@ -78,7 +77,8 @@ struct xsk_queue {
  *
  * (A) is a control dependency that separates the load of ->consumer
  * from the stores of $data. In case ->consumer indicates there is no
- * room in the buffer to store $data we do not. So no barrier is needed.
+ * room in the buffer to store $data, we do not store it. The dependency
+ * will order both of the stores after the loads. So no barrier is needed.
  *
  * (D) protects the load of the data to be observed to happen after the
  * store of the consumer pointer. If we did not have this memory
@@ -227,15 +227,13 @@ static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q,
 
 static inline void __xskq_cons_release(struct xsk_queue *q)
 {
-       smp_mb(); /* D, matches A */
-       WRITE_ONCE(q->ring->consumer, q->cached_cons);
+       smp_store_release(&q->ring->consumer, q->cached_cons); /* D, matches A */
 }
 
 static inline void __xskq_cons_peek(struct xsk_queue *q)
 {
        /* Refresh the local pointer */
-       q->cached_prod = READ_ONCE(q->ring->producer);
-       smp_rmb(); /* C, matches B */
+       q->cached_prod = smp_load_acquire(&q->ring->producer);  /* C, matches B */
 }
 
 static inline void xskq_cons_get_entries(struct xsk_queue *q)
@@ -397,9 +395,7 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue *q,
 
 static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx)
 {
-       smp_wmb(); /* B, matches C */
-
-       WRITE_ONCE(q->ring->producer, idx);
+       smp_store_release(&q->ring->producer, idx); /* B, matches C */
 }
 
 static inline void xskq_prod_submit(struct xsk_queue *q)
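(The smp_load_acquire()/smp_store_release() pairs substituted above
implement exactly the (A)-(D) diagram from the comment. Distilled into a
toy ring with a hypothetical struct:)

	struct toy_ring { u32 producer, consumer, mask; u64 data[]; };

	static void toy_produce(struct toy_ring *r, u64 val)
	{
		r->data[r->producer & r->mask] = val;             /* STORE $data  */
		smp_store_release(&r->producer, r->producer + 1); /* B, matches C */
	}

	static bool toy_consume(struct toy_ring *r, u64 *val)
	{
		if (r->consumer == smp_load_acquire(&r->producer)) /* C, matches B */
			return false;
		*val = r->data[r->consumer & r->mask];             /* LOAD $data   */
		smp_store_release(&r->consumer, r->consumer + 1);  /* D, matches A */
		return true;
	}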
index 113fd90..67b4ce5 100644 (file)
@@ -87,7 +87,6 @@ static void xsk_map_free(struct bpf_map *map)
 {
        struct xsk_map *m = container_of(map, struct xsk_map, map);
 
-       bpf_clear_redirect_map(map);
        synchronize_net();
        bpf_map_area_free(m);
 }
@@ -125,6 +124,16 @@ static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
        return insn - insn_buf;
 }
 
+static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+       struct xsk_map *m = container_of(map, struct xsk_map, map);
+
+       if (key >= map->max_entries)
+               return NULL;
+
+       return READ_ONCE(m->xsk_map[key]);
+}
+
 static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
 {
        WARN_ON_ONCE(!rcu_read_lock_held());
@@ -215,6 +224,11 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key)
        return 0;
 }
 
+static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+       return __bpf_xdp_redirect_map(map, ifindex, flags, __xsk_map_lookup_elem);
+}
+
 void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
                             struct xdp_sock **map_entry)
 {
@@ -247,4 +261,5 @@ const struct bpf_map_ops xsk_map_ops = {
        .map_check_btf = map_check_no_btf,
        .map_btf_name = "xsk_map",
        .map_btf_id = &xsk_map_btf_id,
+       .map_redirect = xsk_map_redirect,
 };
similarity index 82%
rename from scripts/bpf_helpers_doc.py
rename to scripts/bpf_doc.py
index 867ada2..2d94025 100755 (executable)
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 #
 # Copyright (C) 2018-2019 Netronome Systems, Inc.
+# Copyright (C) 2021 Isovalent, Inc.
 
 # In case user attempts to run with Python 2.
 from __future__ import print_function
@@ -13,6 +14,9 @@ import sys, os
 class NoHelperFound(BaseException):
     pass
 
+class NoSyscallCommandFound(BaseException):
+    pass
+
 class ParsingError(BaseException):
     def __init__(self, line='<line not provided>', reader=None):
         if reader:
@@ -22,18 +26,27 @@ class ParsingError(BaseException):
         else:
             BaseException.__init__(self, 'Error parsing line: %s' % line)
 
-class Helper(object):
+
+class APIElement(object):
     """
-    An object representing the description of an eBPF helper function.
-    @proto: function prototype of the helper function
-    @desc: textual description of the helper function
-    @ret: description of the return value of the helper function
+    An object representing the description of an aspect of the eBPF API.
+    @proto: prototype of the API symbol
+    @desc: textual description of the symbol
+    @ret: (optional) description of any associated return value
     """
     def __init__(self, proto='', desc='', ret=''):
         self.proto = proto
         self.desc = desc
         self.ret = ret
 
+
+class Helper(APIElement):
+    """
+    An object representing the description of an eBPF helper function.
+    @proto: function prototype of the helper function
+    @desc: textual description of the helper function
+    @ret: description of the return value of the helper function
+    """
     def proto_break_down(self):
         """
         Break down helper function protocol into smaller chunks: return type,
@@ -60,6 +73,7 @@ class Helper(object):
 
         return res
 
+
 class HeaderParser(object):
     """
     An object used to parse a file in order to extract the documentation of a
@@ -72,6 +86,13 @@ class HeaderParser(object):
         self.reader = open(filename, 'r')
         self.line = ''
         self.helpers = []
+        self.commands = []
+
+    def parse_element(self):
+        proto    = self.parse_symbol()
+        desc     = self.parse_desc()
+        ret      = self.parse_ret()
+        return APIElement(proto=proto, desc=desc, ret=ret)
 
     def parse_helper(self):
         proto    = self.parse_proto()
@@ -79,6 +100,18 @@ class HeaderParser(object):
         ret      = self.parse_ret()
         return Helper(proto=proto, desc=desc, ret=ret)
 
+    def parse_symbol(self):
+        p = re.compile(' \* ?(.+)$')
+        capture = p.match(self.line)
+        if not capture:
+            raise NoSyscallCommandFound
+        end_re = re.compile(' \* ?NOTES$')
+        end = end_re.match(self.line)
+        if end:
+            raise NoSyscallCommandFound
+        self.line = self.reader.readline()
+        return capture.group(1)
+
     def parse_proto(self):
         # Argument can be of shape:
         #   - "void"
@@ -140,16 +173,29 @@ class HeaderParser(object):
                     break
         return ret
 
-    def run(self):
-        # Advance to start of helper function descriptions.
-        offset = self.reader.read().find('* Start of BPF helper function descriptions:')
+    def seek_to(self, target, help_message):
+        self.reader.seek(0)
+        offset = self.reader.read().find(target)
         if offset == -1:
-            raise Exception('Could not find start of eBPF helper descriptions list')
+            raise Exception(help_message)
         self.reader.seek(offset)
         self.reader.readline()
         self.reader.readline()
         self.line = self.reader.readline()
 
+    def parse_syscall(self):
+        self.seek_to('* DOC: eBPF Syscall Commands',
+                     'Could not find start of eBPF syscall descriptions list')
+        while True:
+            try:
+                command = self.parse_element()
+                self.commands.append(command)
+            except NoSyscallCommandFound:
+                break
+
+    def parse_helpers(self):
+        self.seek_to('* Start of BPF helper function descriptions:',
+                     'Could not find start of eBPF helper descriptions list')
         while True:
             try:
                 helper = self.parse_helper()
@@ -157,6 +203,9 @@ class HeaderParser(object):
             except NoHelperFound:
                 break
 
+    def run(self):
+        self.parse_syscall()
+        self.parse_helpers()
         self.reader.close()
 
 ###############################################################################
@@ -165,10 +214,11 @@ class Printer(object):
     """
     A generic class for printers. Printers should be created with an array of
     Helper objects, and implement a way to print them in the desired fashion.
-    @helpers: array of Helper objects to print to standard output
+    @parser: A HeaderParser with objects to print to standard output
     """
-    def __init__(self, helpers):
-        self.helpers = helpers
+    def __init__(self, parser):
+        self.parser = parser
+        self.elements = []
 
     def print_header(self):
         pass
@@ -181,19 +231,23 @@ class Printer(object):
 
     def print_all(self):
         self.print_header()
-        for helper in self.helpers:
-            self.print_one(helper)
+        for elem in self.elements:
+            self.print_one(elem)
         self.print_footer()
 
+
 class PrinterRST(Printer):
     """
-    A printer for dumping collected information about helpers as a ReStructured
-    Text page compatible with the rst2man program, which can be used to
-    generate a manual page for the helpers.
-    @helpers: array of Helper objects to print to standard output
+    A generic class for printers that print ReStructured Text. Printers should
+    be created with a HeaderParser object, and implement a way to print API
+    elements in the desired fashion.
+    @parser: A HeaderParser with objects to print to standard output
     """
-    def print_header(self):
-        header = '''\
+    def __init__(self, parser):
+        self.parser = parser
+
+    def print_license(self):
+        license = '''\
 .. Copyright (C) All BPF authors and contributors from 2014 to present.
 .. See git log include/uapi/linux/bpf.h in kernel tree for details.
 .. 
@@ -221,9 +275,39 @@ class PrinterRST(Printer):
 .. 
 .. Please do not edit this file. It was generated from the documentation
 .. located in file include/uapi/linux/bpf.h of the Linux kernel sources
-.. (helpers description), and from scripts/bpf_helpers_doc.py in the same
+.. (helpers description), and from scripts/bpf_doc.py in the same
 .. repository (header and footer).
+'''
+        print(license)
 
+    def print_elem(self, elem):
+        if (elem.desc):
+            print('\tDescription')
+            # Do not strip all newline characters: formatted code at the end of
+            # a section must be followed by a blank line.
+            for line in re.sub('\n$', '', elem.desc, count=1).split('\n'):
+                print('{}{}'.format('\t\t' if line else '', line))
+
+        if (elem.ret):
+            print('\tReturn')
+            for line in elem.ret.rstrip().split('\n'):
+                print('{}{}'.format('\t\t' if line else '', line))
+
+        print('')
+
+
+class PrinterHelpersRST(PrinterRST):
+    """
+    A printer for dumping collected information about helpers as a ReStructured
+    Text page compatible with the rst2man program, which can be used to
+    generate a manual page for the helpers.
+    @parser: A HeaderParser with Helper objects to print to standard output
+    """
+    def __init__(self, parser):
+        self.elements = parser.helpers
+
+    def print_header(self):
+        header = '''\
 ===========
 BPF-HELPERS
 ===========
@@ -264,6 +348,7 @@ kernel at the top).
 HELPERS
 =======
 '''
+        PrinterRST.print_license(self)
         print(header)
 
     def print_footer(self):
@@ -380,27 +465,50 @@ SEE ALSO
 
     def print_one(self, helper):
         self.print_proto(helper)
+        self.print_elem(helper)
 
-        if (helper.desc):
-            print('\tDescription')
-            # Do not strip all newline characters: formatted code at the end of
-            # a section must be followed by a blank line.
-            for line in re.sub('\n$', '', helper.desc, count=1).split('\n'):
-                print('{}{}'.format('\t\t' if line else '', line))
 
-        if (helper.ret):
-            print('\tReturn')
-            for line in helper.ret.rstrip().split('\n'):
-                print('{}{}'.format('\t\t' if line else '', line))
+class PrinterSyscallRST(PrinterRST):
+    """
+    A printer for dumping collected information about the syscall API as a
+    ReStructured Text page compatible with the rst2man program, which can be
+    used to generate a manual page for the syscall.
+    @parser: A HeaderParser with APIElement objects to print to standard
+             output
+    """
+    def __init__(self, parser):
+        self.elements = parser.commands
+
+    def print_header(self):
+        header = '''\
+===
+bpf
+===
+-------------------------------------------------------------------------------
+Perform a command on an extended BPF object
+-------------------------------------------------------------------------------
+
+:Manual section: 2
+
+COMMANDS
+========
+'''
+        PrinterRST.print_license(self)
+        print(header)
+
+    def print_one(self, command):
+        print('**%s**' % (command.proto))
+        self.print_elem(command)
 
-        print('')
 
 class PrinterHelpers(Printer):
     """
     A printer for dumping collected information about helpers as C header to
     be included from BPF program.
-    @helpers: array of Helper objects to print to standard output
+    @parser: A HeaderParser with Helper objects to print to standard output
     """
+    def __init__(self, parser):
+        self.elements = parser.helpers
 
     type_fwds = [
             'struct bpf_fib_lookup',
@@ -511,7 +619,7 @@ class PrinterHelpers(Printer):
 
     def print_header(self):
         header = '''\
-/* This is auto-generated file. See bpf_helpers_doc.py for details. */
+/* This is auto-generated file. See bpf_doc.py for details. */
 
 /* Forward declarations of BPF structs */'''
 
@@ -589,8 +697,13 @@ script = os.path.abspath(sys.argv[0])
 linuxRoot = os.path.dirname(os.path.dirname(script))
 bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h')
 
+printers = {
+        'helpers': PrinterHelpersRST,
+        'syscall': PrinterSyscallRST,
+}
+
 argParser = argparse.ArgumentParser(description="""
-Parse eBPF header file and generate documentation for eBPF helper functions.
+Parse eBPF header file and generate documentation for the eBPF API.
 The RST-formatted output produced can be turned into a manual page with the
 rst2man utility.
 """)
@@ -601,6 +714,8 @@ if (os.path.isfile(bpfh)):
                            default=bpfh)
 else:
     argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h')
+argParser.add_argument('target', nargs='?', default='helpers',
+                       choices=printers.keys(), help='eBPF API target')
 args = argParser.parse_args()
 
 # Parse file.
@@ -609,7 +724,9 @@ headerParser.run()
 
 # Print formatted output to standard output.
 if args.header:
-    printer = PrinterHelpers(headerParser.helpers)
+    if args.target != 'helpers':
+        raise NotImplementedError('Only helpers header generation is supported')
+    printer = PrinterHelpers(headerParser)
 else:
-    printer = PrinterRST(headerParser.helpers)
+    printer = printers[args.target](headerParser)
 printer.print_all()
diff --git a/tools/bpf/Makefile.helpers b/tools/bpf/Makefile.helpers
deleted file mode 100644 (file)
index 854d084..0000000
+++ /dev/null
@@ -1,60 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-ifndef allow-override
-  include ../scripts/Makefile.include
-  include ../scripts/utilities.mak
-else
-  # Assume Makefile.helpers is being run from bpftool/Documentation
-  # subdirectory. Go up two more directories to fetch bpf.h header and
-  # associated script.
-  UP2DIR := ../../
-endif
-
-INSTALL ?= install
-RM ?= rm -f
-RMDIR ?= rmdir --ignore-fail-on-non-empty
-
-ifeq ($(V),1)
-  Q =
-else
-  Q = @
-endif
-
-prefix ?= /usr/local
-mandir ?= $(prefix)/man
-man7dir = $(mandir)/man7
-
-HELPERS_RST = bpf-helpers.rst
-MAN7_RST = $(HELPERS_RST)
-
-_DOC_MAN7 = $(patsubst %.rst,%.7,$(MAN7_RST))
-DOC_MAN7 = $(addprefix $(OUTPUT),$(_DOC_MAN7))
-
-helpers: man7
-man7: $(DOC_MAN7)
-
-RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null)
-
-$(OUTPUT)$(HELPERS_RST): $(UP2DIR)../../include/uapi/linux/bpf.h
-       $(QUIET_GEN)$(UP2DIR)../../scripts/bpf_helpers_doc.py --filename $< > $@
-
-$(OUTPUT)%.7: $(OUTPUT)%.rst
-ifndef RST2MAN_DEP
-       $(error "rst2man not found, but required to generate man pages")
-endif
-       $(QUIET_GEN)rst2man $< > $@
-
-helpers-clean:
-       $(call QUIET_CLEAN, eBPF_helpers-manpage)
-       $(Q)$(RM) $(DOC_MAN7) $(OUTPUT)$(HELPERS_RST)
-
-helpers-install: helpers
-       $(call QUIET_INSTALL, eBPF_helpers-manpage)
-       $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$(man7dir)
-       $(Q)$(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir)
-
-helpers-uninstall:
-       $(call QUIET_UNINST, eBPF_helpers-manpage)
-       $(Q)$(RM) $(addprefix $(DESTDIR)$(man7dir)/,$(_DOC_MAN7))
-       $(Q)$(RMDIR) $(DESTDIR)$(man7dir)
-
-.PHONY: helpers helpers-clean helpers-install helpers-uninstall
index a07dfc4..00e560a 100644 (file)
@@ -1198,7 +1198,7 @@ static int cmd_run(char *num)
                else
                        return CMD_OK;
                bpf_reset();
-       } while (pcap_next_pkt() && (!has_limit || (has_limit && ++i < pkts)));
+       } while (pcap_next_pkt() && (!has_limit || (++i < pkts)));
 
        rl_printf("bpf passes:%u fails:%u\n", pass, fail);
 
index 8d48e89..dfb7254 100644 (file)
@@ -185,13 +185,13 @@ ldx
        | OP_LDXB number '*' '(' '[' number ']' '&' number ')' {
                if ($2 != 4 || $9 != 0xf) {
                        fprintf(stderr, "ldxb offset not supported!\n");
-                       exit(0);
+                       exit(1);
                } else {
                        bpf_set_curr_instr(BPF_LDX | BPF_MSH | BPF_B, 0, 0, $6); } }
        | OP_LDX number '*' '(' '[' number ']' '&' number ')' {
                if ($2 != 4 || $9 != 0xf) {
                        fprintf(stderr, "ldxb offset not supported!\n");
-                       exit(0);
+                       exit(1);
                } else {
                        bpf_set_curr_instr(BPF_LDX | BPF_MSH | BPF_B, 0, 0, $6); } }
        ;
@@ -472,7 +472,7 @@ static void bpf_assert_max(void)
 {
        if (curr_instr >= BPF_MAXINSNS) {
                fprintf(stderr, "only max %u insns allowed!\n", BPF_MAXINSNS);
-               exit(0);
+               exit(1);
        }
 }
 
@@ -522,7 +522,7 @@ static int bpf_find_insns_offset(const char *label)
 
        if (ret == -ENOENT) {
                fprintf(stderr, "no such label \'%s\'!\n", label);
-               exit(0);
+               exit(1);
        }
 
        return ret;
@@ -549,9 +549,11 @@ static uint8_t bpf_encode_jt_jf_offset(int off, int i)
 {
        int delta = off - i - 1;
 
-       if (delta < 0 || delta > 255)
-               fprintf(stderr, "warning: insn #%d jumps to insn #%d, "
+       if (delta < 0 || delta > 255) {
+               fprintf(stderr, "error: insn #%d jumps to insn #%d, "
                                "which is out of range\n", i, off);
+               exit(1);
+       }
        return (uint8_t) delta;
 }
 
index 944cb4b..05ce444 100644 (file)
@@ -3,7 +3,6 @@
 /bootstrap/
 /bpftool
 bpftool*.8
-bpf-helpers.*
 FEATURE-DUMP.bpftool
 feature
 libbpf
index f33cb02..c494879 100644 (file)
@@ -16,15 +16,12 @@ prefix ?= /usr/local
 mandir ?= $(prefix)/man
 man8dir = $(mandir)/man8
 
-# Load targets for building eBPF helpers man page.
-include ../../Makefile.helpers
-
 MAN8_RST = $(wildcard bpftool*.rst)
 
 _DOC_MAN8 = $(patsubst %.rst,%.8,$(MAN8_RST))
 DOC_MAN8 = $(addprefix $(OUTPUT),$(_DOC_MAN8))
 
-man: man8 helpers
+man: man8
 man8: $(DOC_MAN8)
 
 RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null)
@@ -46,16 +43,16 @@ ifndef RST2MAN_DEP
 endif
        $(QUIET_GEN)( cat $< ; printf "%b" $(call see_also,$<) ) | rst2man $(RST2MAN_OPTS) > $@
 
-clean: helpers-clean
+clean:
        $(call QUIET_CLEAN, Documentation)
        $(Q)$(RM) $(DOC_MAN8)
 
-install: man helpers-install
+install: man
        $(call QUIET_INSTALL, Documentation-man)
        $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$(man8dir)
        $(Q)$(INSTALL) -m 644 $(DOC_MAN8) $(DESTDIR)$(man8dir)
 
-uninstall: helpers-uninstall
+uninstall:
        $(call QUIET_UNINST, Documentation-man)
        $(Q)$(RM) $(addprefix $(DESTDIR)$(man8dir)/,$(_DOC_MAN8))
        $(Q)$(RMDIR) $(DESTDIR)$(man8dir)
index fe9e7b3..985610c 100644 (file)
@@ -36,6 +36,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
        [BTF_KIND_FUNC_PROTO]   = "FUNC_PROTO",
        [BTF_KIND_VAR]          = "VAR",
        [BTF_KIND_DATASEC]      = "DATASEC",
+       [BTF_KIND_FLOAT]        = "FLOAT",
 };
 
 struct btf_attach_table {
@@ -327,6 +328,13 @@ static int dump_btf_type(const struct btf *btf, __u32 id,
                        jsonw_end_array(w);
                break;
        }
+       case BTF_KIND_FLOAT: {
+               if (json_output)
+                       jsonw_uint_field(w, "size", t->size);
+               else
+                       printf(" size=%u", t->size);
+               break;
+       }
        default:
                break;
        }
index 0e93107..7ca54d0 100644 (file)
@@ -596,6 +596,7 @@ static int __btf_dumper_type_only(const struct btf *btf, __u32 type_id,
        switch (BTF_INFO_KIND(t->info)) {
        case BTF_KIND_INT:
        case BTF_KIND_TYPEDEF:
+       case BTF_KIND_FLOAT:
                BTF_PRINT_ARG("%s ", btf__name_by_offset(btf, t->name_off));
                break;
        case BTF_KIND_STRUCT:
index 359960a..40a88df 100644 (file)
@@ -336,6 +336,10 @@ static void probe_kernel_image_config(const char *define_prefix)
                { "CONFIG_BPF_JIT", },
                /* Avoid compiling eBPF interpreter (use JIT only) */
                { "CONFIG_BPF_JIT_ALWAYS_ON", },
+               /* Kernel BTF debug information available */
+               { "CONFIG_DEBUG_INFO_BTF", },
+               /* Kernel module BTF debug information available */
+               { "CONFIG_DEBUG_INFO_BTF_MODULES", },
 
                /* cgroups */
                { "CONFIG_CGROUPS", },
index 8608cd6..6fc3e6f 100644 (file)
@@ -196,6 +196,9 @@ static const char *print_imm(void *private_data,
        else if (insn->src_reg == BPF_PSEUDO_MAP_VALUE)
                snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
                         "map[id:%u][0]+%u", insn->imm, (insn + 1)->imm);
+       else if (insn->src_reg == BPF_PSEUDO_FUNC)
+               snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
+                        "subprog[%+d]", insn->imm);
        else
                snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
                         "0x%llx", (unsigned long long)full_imm);
index 9d9fb62..3818ec5 100644 (file)
@@ -16,7 +16,10 @@ CFLAGS := -g -Wall
 
 # Try to detect best kernel BTF source
 KERNEL_REL := $(shell uname -r)
-VMLINUX_BTF_PATHS := /sys/kernel/btf/vmlinux /boot/vmlinux-$(KERNEL_REL)
+VMLINUX_BTF_PATHS := $(if $(O),$(O)/vmlinux)           \
+       $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \
+       ../../../vmlinux /sys/kernel/btf/vmlinux        \
+       /boot/vmlinux-$(KERNEL_REL)
 VMLINUX_BTF_PATH := $(or $(VMLINUX_BTF),$(firstword                           \
                                          $(wildcard $(VMLINUX_BTF_PATHS))))
 
@@ -66,12 +69,16 @@ $(OUTPUT) $(BPFOBJ_OUTPUT) $(BPFTOOL_OUTPUT):
        $(QUIET_MKDIR)mkdir -p $@
 
 $(OUTPUT)/vmlinux.h: $(VMLINUX_BTF_PATH) | $(OUTPUT) $(BPFTOOL)
+ifeq ($(VMLINUX_H),)
        $(Q)if [ ! -e "$(VMLINUX_BTF_PATH)" ] ; then \
                echo "Couldn't find kernel BTF; set VMLINUX_BTF to"            \
                        "specify its location." >&2;                           \
                exit 1;\
        fi
        $(QUIET_GEN)$(BPFTOOL) btf dump file $(VMLINUX_BTF_PATH) format c > $@
+else
+       $(Q)cp "$(VMLINUX_H)" $@
+endif
 
 $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(BPFOBJ_OUTPUT)
        $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(BPFOBJ_OUTPUT) $@
index 1f18a40..645530c 100644 (file)
@@ -11,9 +11,9 @@ const volatile __u64 min_us = 0;
 const volatile pid_t targ_pid = 0;
 
 struct {
-       __uint(type, BPF_MAP_TYPE_HASH);
-       __uint(max_entries, 10240);
-       __type(key, u32);
+       __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+       __uint(map_flags, BPF_F_NO_PREALLOC);
+       __type(key, int);
        __type(value, u64);
 } start SEC(".maps");
 
@@ -25,15 +25,20 @@ struct {
 
 /* record enqueue timestamp */
 __always_inline
-static int trace_enqueue(u32 tgid, u32 pid)
+static int trace_enqueue(struct task_struct *t)
 {
-       u64 ts;
+       u32 pid = t->pid;
+       u64 *ptr;
 
        if (!pid || (targ_pid && targ_pid != pid))
                return 0;
 
-       ts = bpf_ktime_get_ns();
-       bpf_map_update_elem(&start, &pid, &ts, 0);
+       ptr = bpf_task_storage_get(&start, t, 0,
+                                  BPF_LOCAL_STORAGE_GET_F_CREATE);
+       if (!ptr)
+               return 0;
+
+       *ptr = bpf_ktime_get_ns();
        return 0;
 }
 
@@ -43,7 +48,7 @@ int handle__sched_wakeup(u64 *ctx)
        /* TP_PROTO(struct task_struct *p) */
        struct task_struct *p = (void *)ctx[0];
 
-       return trace_enqueue(p->tgid, p->pid);
+       return trace_enqueue(p);
 }
 
 SEC("tp_btf/sched_wakeup_new")
@@ -52,7 +57,7 @@ int handle__sched_wakeup_new(u64 *ctx)
        /* TP_PROTO(struct task_struct *p) */
        struct task_struct *p = (void *)ctx[0];
 
-       return trace_enqueue(p->tgid, p->pid);
+       return trace_enqueue(p);
 }
 
 SEC("tp_btf/sched_switch")
@@ -70,12 +75,16 @@ int handle__sched_switch(u64 *ctx)
 
        /* ivcsw: treat like an enqueue event and store timestamp */
        if (prev->state == TASK_RUNNING)
-               trace_enqueue(prev->tgid, prev->pid);
+               trace_enqueue(prev);
 
        pid = next->pid;
 
+       /* For pid mismatch, save a bpf_task_storage_get */
+       if (!pid || (targ_pid && targ_pid != pid))
+               return 0;
+
        /* fetch timestamp and calculate delta */
-       tsp = bpf_map_lookup_elem(&start, &pid);
+       tsp = bpf_task_storage_get(&start, next, 0, 0);
        if (!tsp)
                return 0;   /* missed enqueue */
 
@@ -91,7 +100,7 @@ int handle__sched_switch(u64 *ctx)
        bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
                              &event, sizeof(event));
 
-       bpf_map_delete_elem(&start, &pid);
+       bpf_task_storage_delete(&start, next);
        return 0;
 }
 
index 79c8933..2d3036e 100644 (file)
@@ -93,7 +93,717 @@ union bpf_iter_link_info {
        } map;
 };
 
-/* BPF syscall commands, see bpf(2) man-page for details. */
+/* BPF syscall commands, see bpf(2) man-page for more details. */
+/**
+ * DOC: eBPF Syscall Preamble
+ *
+ * The operation to be performed by the **bpf**\ () system call is determined
+ * by the *cmd* argument. Each operation takes an accompanying argument,
+ * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see
+ * below). The *size* argument is the size of the union pointed to by *attr*.
+ */
+/**
+ * DOC: eBPF Syscall Commands
+ *
+ * BPF_MAP_CREATE
+ *     Description
+ *             Create a map and return a file descriptor that refers to the
+ *             map. The close-on-exec file descriptor flag (see **fcntl**\ (2))
+ *             is automatically enabled for the new file descriptor.
+ *
+ *             Applying **close**\ (2) to the file descriptor returned by
+ *             **BPF_MAP_CREATE** will delete the map (but see NOTES).
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_MAP_LOOKUP_ELEM
+ *     Description
+ *             Look up an element with a given *key* in the map referred to
+ *             by the file descriptor *map_fd*.
+ *
+ *             The *flags* argument may be specified as one of the
+ *             following:
+ *
+ *             **BPF_F_LOCK**
+ *                     Look up the value of a spin-locked map without
+ *                     returning the lock. This must be specified if the
+ *                     elements contain a spinlock.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_MAP_UPDATE_ELEM
+ *     Description
+ *             Create or update an element (key/value pair) in a specified map.
+ *
+ *             The *flags* argument should be specified as one of the
+ *             following:
+ *
+ *             **BPF_ANY**
+ *                     Create a new element or update an existing element.
+ *             **BPF_NOEXIST**
+ *                     Create a new element only if it did not exist.
+ *             **BPF_EXIST**
+ *                     Update an existing element.
+ *             **BPF_F_LOCK**
+ *                     Update a spin_lock-ed map element.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ *             May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**,
+ *             **E2BIG**, **EEXIST**, or **ENOENT**.
+ *
+ *             **E2BIG**
+ *                     The number of elements in the map reached the
+ *                     *max_entries* limit specified at map creation time.
+ *             **EEXIST**
+ *                     If *flags* specifies **BPF_NOEXIST** and the element
+ *                     with *key* already exists in the map.
+ *             **ENOENT**
+ *                     If *flags* specifies **BPF_EXIST** and the element with
+ *                     *key* does not exist in the map.
+ *
+ * BPF_MAP_DELETE_ELEM
+ *     Description
+ *             Look up and delete an element by key in a specified map.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_MAP_GET_NEXT_KEY
+ *     Description
+ *             Look up an element by key in a specified map and return the key
+ *             of the next element. Can be used to iterate over all elements
+ *             in the map.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ *             The following cases can be used to iterate over all elements of
+ *             the map:
+ *
+ *             * If *key* is not found, the operation returns zero and sets
+ *               the *next_key* pointer to the key of the first element.
+ *             * If *key* is found, the operation returns zero and sets the
+ *               *next_key* pointer to the key of the next element.
+ *             * If *key* is the last element, returns -1 and *errno* is set
+ *               to **ENOENT**.
+ *
+ *             May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or
+ *             **EINVAL** on error.
+ *
+ * BPF_PROG_LOAD
+ *     Description
+ *             Verify and load an eBPF program, returning a new file
+ *             descriptor associated with the program.
+ *
+ *             Applying **close**\ (2) to the file descriptor returned by
+ *             **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES).
+ *
+ *             The close-on-exec file descriptor flag (see **fcntl**\ (2)) is
+ *             automatically enabled for the new file descriptor.
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_OBJ_PIN
+ *     Description
+ *             Pin an eBPF program or map referred by the specified *bpf_fd*
+ *             to the provided *pathname* on the filesystem.
+ *
+ *             The *pathname* argument must not contain a dot (".").
+ *
+ *             On success, *pathname* retains a reference to the eBPF object,
+ *             preventing deallocation of the object when the original
+ *             *bpf_fd* is closed. This allows the eBPF object to live beyond
+ *             **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent
+ *             process.
+ *
+ *             Applying **unlink**\ (2) or similar calls to the *pathname*
+ *             unpins the object from the filesystem, removing the reference.
+ *             If no other file descriptors or filesystem nodes refer to the
+ *             same object, it will be deallocated (see NOTES).
+ *
+ *             The filesystem type for the parent directory of *pathname* must
+ *             be **BPF_FS_MAGIC**.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_OBJ_GET
+ *     Description
+ *             Open a file descriptor for the eBPF object pinned to the
+ *             specified *pathname*.
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_PROG_ATTACH
+ *     Description
+ *             Attach an eBPF program to a *target_fd* at the specified
+ *             *attach_type* hook.
+ *
+ *             The *attach_type* specifies the eBPF attachment point to
+ *             attach the program to, and must be one of *bpf_attach_type*
+ *             (see below).
+ *
+ *             The *attach_bpf_fd* must be a valid file descriptor for a
+ *             loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap
+ *             or sock_ops type corresponding to the specified *attach_type*.
+ *
+ *             The *target_fd* must be a valid file descriptor for a kernel
+ *             object which depends on the attach type of *attach_bpf_fd*:
+ *
+ *             **BPF_PROG_TYPE_CGROUP_DEVICE**,
+ *             **BPF_PROG_TYPE_CGROUP_SKB**,
+ *             **BPF_PROG_TYPE_CGROUP_SOCK**,
+ *             **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
+ *             **BPF_PROG_TYPE_CGROUP_SOCKOPT**,
+ *             **BPF_PROG_TYPE_CGROUP_SYSCTL**,
+ *             **BPF_PROG_TYPE_SOCK_OPS**
+ *
+ *                     Control Group v2 hierarchy with the eBPF controller
+ *                     enabled. Requires the kernel to be compiled with
+ *                     **CONFIG_CGROUP_BPF**.
+ *
+ *             **BPF_PROG_TYPE_FLOW_DISSECTOR**
+ *
+ *                     Network namespace (e.g. /proc/self/ns/net).
+ *
+ *             **BPF_PROG_TYPE_LIRC_MODE2**
+ *
+ *                     LIRC device path (e.g. /dev/lircN). Requires the kernel
+ *                     to be compiled with **CONFIG_BPF_LIRC_MODE2**.
+ *
+ *             **BPF_PROG_TYPE_SK_SKB**,
+ *             **BPF_PROG_TYPE_SK_MSG**
+ *
+ *                     eBPF map of socket type (e.g. **BPF_MAP_TYPE_SOCKHASH**).
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_PROG_DETACH
+ *     Description
+ *             Detach the eBPF program associated with the *target_fd* at the
+ *             hook specified by *attach_type*. The program must have been
+ *             previously attached using **BPF_PROG_ATTACH**.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_PROG_TEST_RUN
+ *     Description
+ *             Run the eBPF program associated with the *prog_fd* a *repeat*
+ *             number of times against a provided program context *ctx_in* and
+ *             data *data_in*, and return the modified program context
+ *             *ctx_out*, *data_out* (for example, packet data), result of the
+ *             execution *retval*, and *duration* of the test run.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ *             **ENOSPC**
+ *                     Either *data_size_out* or *ctx_size_out* is too small.
+ *             **ENOTSUPP**
+ *                     This command is not supported by the program type of
+ *                     the program referred to by *prog_fd*.
+ *
+ * BPF_PROG_GET_NEXT_ID
+ *     Description
+ *             Fetch the next eBPF program currently loaded into the kernel.
+ *
+ *             Looks for the eBPF program with an id greater than *start_id*
+ *             and updates *next_id* on success. If no other eBPF programs
+ *             remain with ids higher than *start_id*, returns -1 and sets
+ *             *errno* to **ENOENT**.
+ *
+ *     Return
+ *             Returns zero on success. On error, or when no id remains, -1
+ *             is returned and *errno* is set appropriately.
+ *
+ * BPF_MAP_GET_NEXT_ID
+ *     Description
+ *             Fetch the next eBPF map currently loaded into the kernel.
+ *
+ *             Looks for the eBPF map with an id greater than *start_id*
+ *             and updates *next_id* on success. If no other eBPF maps
+ *             remain with ids higher than *start_id*, returns -1 and sets
+ *             *errno* to **ENOENT**.
+ *
+ *     Return
+ *             Returns zero on success. On error, or when no id remains, -1
+ *             is returned and *errno* is set appropriately.
+ *
+ * BPF_PROG_GET_FD_BY_ID
+ *     Description
+ *             Open a file descriptor for the eBPF program corresponding to
+ *             *prog_id*.
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_MAP_GET_FD_BY_ID
+ *     Description
+ *             Open a file descriptor for the eBPF map corresponding to
+ *             *map_id*.
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_OBJ_GET_INFO_BY_FD
+ *     Description
+ *             Obtain information about the eBPF object corresponding to
+ *             *bpf_fd*.
+ *
+ *             Populates up to *info_len* bytes of *info*, which will be in
+ *             one of the following formats depending on the eBPF object type
+ *             of *bpf_fd*:
+ *
+ *             * **struct bpf_prog_info**
+ *             * **struct bpf_map_info**
+ *             * **struct bpf_btf_info**
+ *             * **struct bpf_link_info**
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_PROG_QUERY
+ *     Description
+ *             Obtain information about eBPF programs associated with the
+ *             specified *attach_type* hook.
+ *
+ *             The *target_fd* must be a valid file descriptor for a kernel
+ *             object which depends on the attach type of *attach_bpf_fd*:
+ *
+ *             **BPF_PROG_TYPE_CGROUP_DEVICE**,
+ *             **BPF_PROG_TYPE_CGROUP_SKB**,
+ *             **BPF_PROG_TYPE_CGROUP_SOCK**,
+ *             **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
+ *             **BPF_PROG_TYPE_CGROUP_SOCKOPT**,
+ *             **BPF_PROG_TYPE_CGROUP_SYSCTL**,
+ *             **BPF_PROG_TYPE_SOCK_OPS**
+ *
+ *                     Control Group v2 hierarchy with the eBPF controller
+ *                     enabled. Requires the kernel to be compiled with
+ *                     **CONFIG_CGROUP_BPF**.
+ *
+ *             **BPF_PROG_TYPE_FLOW_DISSECTOR**
+ *
+ *                     Network namespace (e.g. /proc/self/ns/net).
+ *
+ *             **BPF_PROG_TYPE_LIRC_MODE2**
+ *
+ *                     LIRC device path (e.g. /dev/lircN). Requires the kernel
+ *                     to be compiled with **CONFIG_BPF_LIRC_MODE2**.
+ *
+ *             **BPF_PROG_QUERY** always fetches the number of programs
+ *             attached and the *attach_flags* which were used to attach those
+ *             programs. Additionally, if *prog_ids* is nonzero and the number
+ *             of attached programs is less than *prog_cnt*, populates
+ *             *prog_ids* with the eBPF program ids of the programs attached
+ *             at *target_fd*.
+ *
+ *             The following flags may alter the result:
+ *
+ *             **BPF_F_QUERY_EFFECTIVE**
+ *                     Only return information regarding programs which are
+ *                     currently effective at the specified *target_fd*.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_RAW_TRACEPOINT_OPEN
+ *     Description
+ *             Attach an eBPF program to a tracepoint *name* to access kernel
+ *             internal arguments of the tracepoint in their raw form.
+ *
+ *             The *prog_fd* must be a valid file descriptor associated with
+ *             a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**.
+ *
+ *             No ABI guarantees are made about the content of tracepoint
+ *             arguments exposed to the corresponding eBPF program.
+ *
+ *             Applying **close**\ (2) to the file descriptor returned by
+ *             **BPF_RAW_TRACEPOINT_OPEN** will detach the program (but see NOTES).
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_BTF_LOAD
+ *     Description
+ *             Verify and load BPF Type Format (BTF) metadata into the kernel,
+ *             returning a new file descriptor associated with the metadata.
+ *             BTF is described in more detail at
+ *             https://www.kernel.org/doc/html/latest/bpf/btf.html.
+ *
+ *             The *btf* parameter must point to valid memory providing
+ *             *btf_size* bytes of BTF binary metadata.
+ *
+ *             The returned file descriptor can be passed to other **bpf**\ ()
+ *             subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to
+ *             associate the BTF with those objects.
+ *
+ *             Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional
+ *             parameters to specify a *btf_log_buf*, *btf_log_size* and
+ *             *btf_log_level* which allow the kernel to return freeform log
+ *             output regarding the BTF verification process.
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_BTF_GET_FD_BY_ID
+ *     Description
+ *             Open a file descriptor for the BPF Type Format (BTF)
+ *             corresponding to *btf_id*.
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_TASK_FD_QUERY
+ *     Description
+ *             Obtain information about eBPF programs associated with the
+ *             target process identified by *pid* and *fd*.
+ *
+ *             If the *pid* and *fd* are associated with a tracepoint, kprobe
+ *             or uprobe perf event, then the *prog_id* and *fd_type* will
+ *             be populated with the eBPF program id and file descriptor type
+ *             of type **bpf_task_fd_type**. If associated with a kprobe or
+ *             uprobe, the *probe_offset* and *probe_addr* will also be
+ *             populated. Optionally, if *buf* is provided, then up to
+ *             *buf_len* bytes of *buf* will be populated with the name of
+ *             the tracepoint, kprobe or uprobe.
+ *
+ *             The resulting *prog_id* may be introspected in deeper detail
+ *             using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_MAP_LOOKUP_AND_DELETE_ELEM
+ *     Description
+ *             Look up an element with the given *key* in the map referred to
+ *             by the file descriptor *fd*, and if found, delete the element.
+ *
+ *             The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types
+ *             implement this command as a "pop" operation, deleting the top
+ *             element rather than one corresponding to *key*.
+ *             The *key* and *key_len* parameters should be zeroed when
+ *             issuing this operation for these map types.
+ *
+ *             This command is only valid for the following map types:
+ *             * **BPF_MAP_TYPE_QUEUE**
+ *             * **BPF_MAP_TYPE_STACK**
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_MAP_FREEZE
+ *     Description
+ *             Freeze the permissions of the specified map.
+ *
+ *             Write permissions may be frozen by passing zero *flags*.
+ *             Upon success, no future syscall invocations may alter the
+ *             map state of *map_fd*. Write operations from eBPF programs
+ *             are still possible for a frozen map.
+ *
+ *             Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_BTF_GET_NEXT_ID
+ *     Description
+ *             Fetch the next BPF Type Format (BTF) object currently loaded
+ *             into the kernel.
+ *
+ *             Looks for the BTF object with an id greater than *start_id*
+ *             and updates *next_id* on success. If no other BTF objects
+ *             remain with ids higher than *start_id*, returns -1 and sets
+ *             *errno* to **ENOENT**.
+ *
+ *     Return
+ *             Returns zero on success. On error, or when no id remains, -1
+ *             is returned and *errno* is set appropriately.
+ *
+ * BPF_MAP_LOOKUP_BATCH
+ *     Description
+ *             Iterate and fetch multiple elements in a map.
+ *
+ *             Two opaque values are used to manage batch operations,
+ *             *in_batch* and *out_batch*. Initially, *in_batch* must be set
+ *             to NULL to begin the batched operation. After each subsequent
+ *             **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant
+ *             *out_batch* as the *in_batch* for the next operation to
+ *             continue iteration from the current point.
+ *
+ *             The *keys* and *values* are output parameters which must point
+ *             to memory large enough to hold *count* items based on the key
+ *             and value size of the map *map_fd*. The *keys* buffer must be
+ *             *key_size* * *count* bytes, and the *values* buffer must be
+ *             *value_size* * *count* bytes.
+ *
+ *             The *elem_flags* argument may be specified as one of the
+ *             following:
+ *
+ *             **BPF_F_LOCK**
+ *                     Look up the value of a spin-locked map without
+ *                     returning the lock. This must be specified if the
+ *                     elements contain a spinlock.
+ *
+ *             On success, *count* elements from the map are copied into the
+ *             user buffer, with the keys copied into *keys* and the values
+ *             copied into the corresponding indices in *values*.
+ *
+ *             If an error is returned and *errno* is not **EFAULT**, *count*
+ *             is set to the number of successfully processed elements.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ *             May set *errno* to **ENOSPC** to indicate that *keys* or
+ *             *values* is too small to dump an entire bucket during
+ *             iteration of a hash-based map type.
+ *
+ * BPF_MAP_LOOKUP_AND_DELETE_BATCH
+ *     Description
+ *             Iterate and delete all elements in a map.
+ *
+ *             This operation has the same behavior as
+ *             **BPF_MAP_LOOKUP_BATCH** with two exceptions:
+ *
+ *             * Every element that is successfully returned is also deleted
+ *               from the map. This is at most *count* elements. Note that
+ *               *count* is both an input and an output parameter.
+ *             * Upon returning with *errno* set to **EFAULT**, up to
+ *               *count* elements may be deleted without returning the keys
+ *               and values of the deleted elements.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_MAP_UPDATE_BATCH
+ *     Description
+ *             Update multiple elements in a map by *key*.
+ *
+ *             The *keys* and *values* are input parameters which must point
+ *             to memory large enough to hold *count* items based on the key
+ *             and value size of the map *map_fd*. The *keys* buffer must be
+ *             *key_size* * *count* bytes, and the *values* buffer must be
+ *             *value_size* * *count* bytes.
+ *
+ *             Each element specified in *keys* is sequentially updated to the
+ *             value in the corresponding index in *values*. The *in_batch*
+ *             and *out_batch* parameters are ignored and should be zeroed.
+ *
+ *             The *elem_flags* argument should be specified as one of the
+ *             following:
+ *
+ *             **BPF_ANY**
+ *                     Create new elements or update existing elements.
+ *             **BPF_NOEXIST**
+ *                     Create new elements only if they do not exist.
+ *             **BPF_EXIST**
+ *                     Update existing elements.
+ *             **BPF_F_LOCK**
+ *                     Update spin_lock-ed map elements. This must be
+ *                     specified if the map value contains a spinlock.
+ *
+ *             On success, *count* elements from the map are updated.
+ *
+ *             If an error is returned and *errno* is not **EFAULT**, *count*
+ *             is set to the number of successfully processed elements.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ *             May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or
+ *             **E2BIG**. **E2BIG** indicates that the number of elements in
+ *             the map reached the *max_entries* limit specified at map
+ *             creation time.
+ *
+ *             May set *errno* to one of the following error codes under
+ *             specific circumstances:
+ *
+ *             **EEXIST**
+ *                     If *flags* specifies **BPF_NOEXIST** and the element
+ *                     with *key* already exists in the map.
+ *             **ENOENT**
+ *                     If *flags* specifies **BPF_EXIST** and the element with
+ *                     *key* does not exist in the map.
+ *
+ * BPF_MAP_DELETE_BATCH
+ *     Description
+ *             Delete multiple elements in a map by *key*.
+ *
+ *             The *keys* parameter is an input parameter which must point
+ *             to memory large enough to hold *count* items based on the key
+ *             size of the map *map_fd*, that is, *key_size* * *count*.
+ *
+ *             Each element specified in *keys* is sequentially deleted. The
+ *             *in_batch*, *out_batch*, and *values* parameters are ignored
+ *             and should be zeroed.
+ *
+ *             The *elem_flags* argument may be specified as one of the
+ *             following:
+ *
+ *             **BPF_F_LOCK**
+ *                     Look up the value of a spin-locked map without
+ *                     returning the lock. This must be specified if the
+ *                     elements contain a spinlock.
+ *
+ *             On success, *count* elements from the map are deleted.
+ *
+ *             If an error is returned and *errno* is not **EFAULT**, *count*
+ *             is set to the number of successfully processed elements. If
+ *             *errno* is **EFAULT**, up to *count* elements may have been
+ *             deleted.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_LINK_CREATE
+ *     Description
+ *             Attach an eBPF program to a *target_fd* at the specified
+ *             *attach_type* hook and return a file descriptor handle for
+ *             managing the link.
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_LINK_UPDATE
+ *     Description
+ *             Update the eBPF program in the specified *link_fd* to
+ *             *new_prog_fd*.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_LINK_GET_FD_BY_ID
+ *     Description
+ *             Open a file descriptor for the eBPF Link corresponding to
+ *             *link_id*.
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_LINK_GET_NEXT_ID
+ *     Description
+ *             Fetch the next eBPF link currently loaded into the kernel.
+ *
+ *             Looks for the eBPF link with an id greater than *start_id*
+ *             and updates *next_id* on success. If no other eBPF links
+ *             remain with ids higher than *start_id*, returns -1 and sets
+ *             *errno* to **ENOENT**.
+ *
+ *     Return
+ *             Returns zero on success. On error, or when no id remains, -1
+ *             is returned and *errno* is set appropriately.
+ *
+ * BPF_ENABLE_STATS
+ *     Description
+ *             Enable eBPF runtime statistics gathering.
+ *
+ *             Runtime statistics gathering for the eBPF runtime is disabled
+ *             by default to minimize the corresponding performance overhead.
+ *             This command enables statistics globally.
+ *
+ *             Multiple programs may independently enable statistics.
+ *             After gathering the desired statistics, eBPF runtime statistics
+ *             may be disabled again by calling **close**\ (2) for the file
+ *             descriptor returned by this subcommand. Statistics will only be
+ *             disabled system-wide when all outstanding file descriptors
+ *             returned by prior calls for this subcommand are closed.
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_ITER_CREATE
+ *     Description
+ *             Create an iterator on top of the specified *link_fd* (as
+ *             previously created using **BPF_LINK_CREATE**) and return a
+ *             file descriptor that can be used to trigger the iteration.
+ *
+ *             If the resulting file descriptor is pinned to the filesystem
+ *             using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls
+ *             for that path will trigger the iterator to read kernel state
+ *             using the eBPF program attached to *link_fd*.
+ *
+ *     Return
+ *             A new file descriptor (a nonnegative integer), or -1 if an
+ *             error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_LINK_DETACH
+ *     Description
+ *             Forcefully detach the specified *link_fd* from its
+ *             corresponding attachment point.
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * BPF_PROG_BIND_MAP
+ *     Description
+ *             Bind a map to the lifetime of an eBPF program.
+ *
+ *             The map identified by *map_fd* is bound to the program
+ *             identified by *prog_fd* and only released when *prog_fd* is
+ *             released. This may be used in cases where metadata should be
+ *             associated with a program which otherwise does not contain any
+ *             references to the map (for example, embedded in the eBPF
+ *             program instructions).
+ *
+ *     Return
+ *             Returns zero on success. On error, -1 is returned and *errno*
+ *             is set appropriately.
+ *
+ * NOTES
+ *     eBPF objects (maps and programs) can be shared between processes.
+ *
+ *     * After **fork**\ (2), the child inherits file descriptors
+ *       referring to the same eBPF objects.
+ *     * File descriptors referring to eBPF objects can be transferred over
+ *       **unix**\ (7) domain sockets.
+ *     * File descriptors referring to eBPF objects can be duplicated in the
+ *       usual way, using **dup**\ (2) and similar calls.
+ *     * File descriptors referring to eBPF objects can be pinned to the
+ *       filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2).
+ *
+ *     An eBPF object is deallocated only after all file descriptors referring
+ *     to the object have been closed and no references remain pinned to the
+ *     filesystem or attached (for example, bound to a program or device).
+ */
 enum bpf_cmd {
        BPF_MAP_CREATE,
        BPF_MAP_LOOKUP_ELEM,
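
A minimal userspace sketch (not part of this patch; the map shape, names and
error handling are illustrative) that exercises **BPF_MAP_CREATE**,
**BPF_MAP_UPDATE_ELEM** and the **BPF_MAP_GET_NEXT_KEY** iteration contract
documented above via the raw bpf(2) syscall:

        #include <linux/bpf.h>
        #include <stdio.h>
        #include <string.h>
        #include <unistd.h>
        #include <sys/syscall.h>

        static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr,
                           unsigned int size)
        {
                return syscall(__NR_bpf, cmd, attr, size);
        }

        int main(void)
        {
                union bpf_attr attr;
                __u32 key = 1, next_key;
                __u64 value = 42;
                int map_fd;

                /* BPF_MAP_CREATE: returns a new fd, close-on-exec enabled */
                memset(&attr, 0, sizeof(attr));
                attr.map_type = BPF_MAP_TYPE_HASH;
                attr.key_size = sizeof(key);
                attr.value_size = sizeof(value);
                attr.max_entries = 16;
                map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
                if (map_fd < 0)
                        return 1;

                /* BPF_MAP_UPDATE_ELEM with BPF_ANY: create or update */
                memset(&attr, 0, sizeof(attr));
                attr.map_fd = map_fd;
                attr.key = (__u64)(unsigned long)&key;
                attr.value = (__u64)(unsigned long)&value;
                attr.flags = BPF_ANY;
                if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)))
                        return 1;

                /* BPF_MAP_GET_NEXT_KEY: a NULL *key* (or a key not in the
                 * map) yields the first element; -1 with errno == ENOENT
                 * ends the iteration.
                 */
                memset(&attr, 0, sizeof(attr));
                attr.map_fd = map_fd;
                attr.next_key = (__u64)(unsigned long)&next_key;
                while (!sys_bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr))) {
                        printf("key: %u\n", next_key);
                        attr.key = (__u64)(unsigned long)&next_key;
                }

                close(map_fd);
                return 0;
        }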
@@ -393,6 +1103,15 @@ enum bpf_link_type {
  *                   is struct/union.
  */
 #define BPF_PSEUDO_BTF_ID      3
+/* insn[0].src_reg:  BPF_PSEUDO_FUNC
+ * insn[0].imm:      insn offset to the func
+ * insn[1].imm:      0
+ * insn[0].off:      0
+ * insn[1].off:      0
+ * ldimm64 rewrite:  address of the function
+ * verifier type:    PTR_TO_FUNC.
+ */
+#define BPF_PSEUDO_FUNC                4
 
 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
  * offset to another bpf function
@@ -720,7 +1439,7 @@ union bpf_attr {
  * parsed and used to produce a manual page. The workflow is the following,
  * and requires the rst2man utility:
  *
- *     $ ./scripts/bpf_helpers_doc.py \
+ *     $ ./scripts/bpf_doc.py \
  *             --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
  *     $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
  *     $ man /tmp/bpf-helpers.7
@@ -1765,6 +2484,10 @@ union bpf_attr {
  *               Use with ENCAP_L3/L4 flags to further specify the tunnel
  *               type; *len* is the length of the inner MAC header.
  *
+ *             * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**:
+ *               Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
+ *               L2 type as Ethernet.
+ *
  *             A call to this helper is susceptible to change the underlying
  *             packet buffer. Therefore, at load time, all checks on pointers
  *             previously done by the verifier are invalidated and must be
@@ -3909,6 +4632,34 @@ union bpf_attr {
  *             * **BPF_MTU_CHK_RET_FRAG_NEEDED**
  *             * **BPF_MTU_CHK_RET_SEGS_TOOBIG**
  *
+ * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags)
+ *     Description
+ *             For each element in **map**, call **callback_fn** function with
+ *             **map**, **callback_ctx** and other map-specific parameters.
+ *             The **callback_fn** should be a static function and
+ *             the **callback_ctx** should be a pointer to the stack.
+ *             The **flags** argument is used to control certain aspects of the
+ *             helper. Currently, the **flags** must be 0.
+ *
+ *             The following is a list of supported map types and their
+ *             expected callback signatures:
+ *
+ *             BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH,
+ *             BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH,
+ *             BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY
+ *
+ *             long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx);
+ *
+ *             For per-CPU maps, the map value is the value on the CPU where the
+ *             BPF program is running.
+ *
+ *             If **callback_fn** returns 0, the helper will continue to the next
+ *             element. If the return value is 1, the helper will skip the rest of
+ *             the elements and return. Other return values are currently unused.
+ *
+ *     Return
+ *             The number of traversed map elements on success, or **-EINVAL** for
+ *             invalid **flags**.
  */
 #define __BPF_FUNC_MAPPER(FN)          \
        FN(unspec),                     \
@@ -4075,6 +4826,7 @@ union bpf_attr {
        FN(ima_inode_hash),             \
        FN(sock_from_file),             \
        FN(check_mtu),                  \
+       FN(for_each_map_elem),          \
        /* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
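
A hypothetical BPF-side sketch of the bpf_for_each_map_elem() callback
contract described above; the map, function and section names are
illustrative, and the helper declaration is assumed to come from libbpf's
generated bpf_helper_defs.h (via bpf_helpers.h):

        #include <linux/bpf.h>
        #include <bpf/bpf_helpers.h>

        char LICENSE[] SEC("license") = "GPL";

        struct {
                __uint(type, BPF_MAP_TYPE_ARRAY);
                __uint(max_entries, 64);
                __type(key, __u32);
                __type(value, __u64);
        } vals SEC(".maps");

        struct cb_ctx {
                __u64 sum;
        };

        /* Matches the documented signature for array/hash maps:
         * long (*callback_fn)(struct bpf_map *map, const void *key,
         *                     void *value, void *ctx)
         */
        static long sum_elem(struct bpf_map *map, const void *key,
                             void *value, void *ctx)
        {
                struct cb_ctx *c = ctx;

                c->sum += *(__u64 *)value;
                return 0;       /* 0 continues; 1 stops the iteration early */
        }

        SEC("fentry/bpf_fentry_test1")
        int sum_values(void *ctx)
        {
                /* callback_ctx must point to the program's stack */
                struct cb_ctx data = { .sum = 0 };

                bpf_for_each_map_elem(&vals, sum_elem, &data, 0);
                return 0;
        }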
@@ -4168,6 +4920,7 @@ enum {
        BPF_F_ADJ_ROOM_ENCAP_L4_GRE     = (1ULL << 3),
        BPF_F_ADJ_ROOM_ENCAP_L4_UDP     = (1ULL << 4),
        BPF_F_ADJ_ROOM_NO_CSUM_RESET    = (1ULL << 5),
+       BPF_F_ADJ_ROOM_ENCAP_L2_ETH     = (1ULL << 6),
 };
 
 enum {
@@ -5205,7 +5958,10 @@ struct bpf_pidns_info {
 
 /* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
 struct bpf_sk_lookup {
-       __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+       union {
+               __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+               __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */
+       };
 
        __u32 family;           /* Protocol family (AF_INET, AF_INET6) */
        __u32 protocol;         /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
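
A rough userspace sketch of consuming the new *cookie* field when running an
SK_LOOKUP program under **BPF_PROG_TEST_RUN**; it assumes libbpf's existing
bpf_prog_test_run_xattr() and its ctx_in/ctx_out plumbing, and the program
fd, addresses and ports are illustrative:

        #include <stdio.h>
        #include <arpa/inet.h>
        #include <linux/bpf.h>
        #include <bpf/bpf.h>

        static void check_selected(int prog_fd)
        {
                struct bpf_sk_lookup ctx = {
                        .family      = AF_INET,
                        .protocol    = IPPROTO_TCP,
                        .remote_ip4  = htonl(INADDR_LOOPBACK),
                        .remote_port = htons(60000),    /* network byte order */
                        .local_ip4   = htonl(INADDR_LOOPBACK),
                        .local_port  = 80,              /* host byte order */
                };
                struct bpf_prog_test_run_attr tattr = {
                        .prog_fd      = prog_fd,
                        .ctx_in       = &ctx,
                        .ctx_size_in  = sizeof(ctx),
                        .ctx_out      = &ctx,
                        .ctx_size_out = sizeof(ctx),
                };

                /* On return, a non-zero cookie means the program selected a
                 * socket; the sk pointer itself is never exposed to userspace.
                 */
                if (!bpf_prog_test_run_xattr(&tattr) && ctx.cookie)
                        printf("selected socket cookie: %llu\n",
                               (unsigned long long)ctx.cookie);
        }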
index 5a66710..d27b170 100644 (file)
@@ -52,7 +52,7 @@ struct btf_type {
        };
 };
 
-#define BTF_INFO_KIND(info)    (((info) >> 24) & 0x0f)
+#define BTF_INFO_KIND(info)    (((info) >> 24) & 0x1f)
 #define BTF_INFO_VLEN(info)    ((info) & 0xffff)
 #define BTF_INFO_KFLAG(info)   ((info) >> 31)
 
@@ -72,7 +72,8 @@ struct btf_type {
 #define BTF_KIND_FUNC_PROTO    13      /* Function Proto       */
 #define BTF_KIND_VAR           14      /* Variable     */
 #define BTF_KIND_DATASEC       15      /* Section      */
-#define BTF_KIND_MAX           BTF_KIND_DATASEC
+#define BTF_KIND_FLOAT         16      /* Floating point       */
+#define BTF_KIND_MAX           BTF_KIND_FLOAT
 #define NR_BTF_KINDS           (BTF_KIND_MAX + 1)
 
 /* For some specific BTF_KIND, "struct btf_type" is immediately
index 887a494..8170f88 100644 (file)
@@ -158,7 +158,7 @@ $(BPF_IN_STATIC): force $(BPF_HELPER_DEFS)
        $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR)
 
 $(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h
-       $(QUIET_GEN)$(srctree)/scripts/bpf_helpers_doc.py --header \
+       $(QUIET_GEN)$(srctree)/scripts/bpf_doc.py --header \
                --file $(srctree)/tools/include/uapi/linux/bpf.h > $(BPF_HELPER_DEFS)
 
 $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION)
index d9c1083..3aa58f2 100644 (file)
@@ -291,6 +291,7 @@ static int btf_type_size(const struct btf_type *t)
        case BTF_KIND_PTR:
        case BTF_KIND_TYPEDEF:
        case BTF_KIND_FUNC:
+       case BTF_KIND_FLOAT:
                return base_size;
        case BTF_KIND_INT:
                return base_size + sizeof(__u32);
@@ -338,6 +339,7 @@ static int btf_bswap_type_rest(struct btf_type *t)
        case BTF_KIND_PTR:
        case BTF_KIND_TYPEDEF:
        case BTF_KIND_FUNC:
+       case BTF_KIND_FLOAT:
                return 0;
        case BTF_KIND_INT:
                *(__u32 *)(t + 1) = bswap_32(*(__u32 *)(t + 1));
@@ -578,6 +580,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id)
                case BTF_KIND_UNION:
                case BTF_KIND_ENUM:
                case BTF_KIND_DATASEC:
+               case BTF_KIND_FLOAT:
                        size = t->size;
                        goto done;
                case BTF_KIND_PTR:
@@ -621,6 +624,7 @@ int btf__align_of(const struct btf *btf, __u32 id)
        switch (kind) {
        case BTF_KIND_INT:
        case BTF_KIND_ENUM:
+       case BTF_KIND_FLOAT:
                return min(btf_ptr_sz(btf), (size_t)t->size);
        case BTF_KIND_PTR:
                return btf_ptr_sz(btf);
@@ -1756,6 +1760,47 @@ int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding
        return btf_commit_type(btf, sz);
 }
 
+/*
+ * Append new BTF_KIND_FLOAT type with:
+ *   - *name* - non-empty, non-NULL type name;
+ *   - *byte_sz* - size of the type, in bytes;
+ * Returns:
+ *   - >0, type ID of newly added BTF type;
+ *   - <0, on error.
+ */
+int btf__add_float(struct btf *btf, const char *name, size_t byte_sz)
+{
+       struct btf_type *t;
+       int sz, name_off;
+
+       /* non-empty name */
+       if (!name || !name[0])
+               return -EINVAL;
+
+       /* byte_sz must be one of the explicitly allowed values */
+       if (byte_sz != 2 && byte_sz != 4 && byte_sz != 8 && byte_sz != 12 &&
+           byte_sz != 16)
+               return -EINVAL;
+
+       if (btf_ensure_modifiable(btf))
+               return -ENOMEM;
+
+       sz = sizeof(struct btf_type);
+       t = btf_add_type_mem(btf, sz);
+       if (!t)
+               return -ENOMEM;
+
+       name_off = btf__add_str(btf, name);
+       if (name_off < 0)
+               return name_off;
+
+       t->name_off = name_off;
+       t->info = btf_type_info(BTF_KIND_FLOAT, 0, 0);
+       t->size = byte_sz;
+
+       return btf_commit_type(btf, sz);
+}
+
 /* it's completely legal to append BTF types with type IDs pointing forward to
  * types that haven't been appended yet, so we only make sure that id looks
  * sane, we can't guarantee that ID will always be valid
@@ -1883,7 +1928,7 @@ static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32
  *   - *byte_sz* - size of the struct, in bytes;
  *
  * Struct initially has no fields in it. Fields can be added by
- * btf__add_field() right after btf__add_struct() succeeds. 
+ * btf__add_field() right after btf__add_struct() succeeds.
  *
  * Returns:
  *   - >0, type ID of newly added BTF type;
@@ -3626,6 +3671,7 @@ static int btf_dedup_prep(struct btf_dedup *d)
                case BTF_KIND_FWD:
                case BTF_KIND_TYPEDEF:
                case BTF_KIND_FUNC:
+               case BTF_KIND_FLOAT:
                        h = btf_hash_common(t);
                        break;
                case BTF_KIND_INT:
@@ -3722,6 +3768,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id)
                break;
 
        case BTF_KIND_FWD:
+       case BTF_KIND_FLOAT:
                h = btf_hash_common(t);
                for_each_dedup_cand(d, hash_entry, h) {
                        cand_id = (__u32)(long)hash_entry->value;
@@ -3983,6 +4030,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id,
                        return btf_compat_enum(cand_type, canon_type);
 
        case BTF_KIND_FWD:
+       case BTF_KIND_FLOAT:
                return btf_equal_common(cand_type, canon_type);
 
        case BTF_KIND_CONST:
@@ -4479,6 +4527,7 @@ static int btf_dedup_remap_type(struct btf_dedup *d, __u32 type_id)
        switch (btf_kind(t)) {
        case BTF_KIND_INT:
        case BTF_KIND_ENUM:
+       case BTF_KIND_FLOAT:
                break;
 
        case BTF_KIND_FWD:
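
A minimal usage sketch (not from this patch) for the new btf__add_float()
API implemented above; it creates an empty BTF object and appends a single
8-byte FLOAT type:

        #include <stdio.h>
        #include <bpf/btf.h>

        int main(void)
        {
                struct btf *btf = btf__new_empty();
                int id;

                if (!btf)
                        return 1;

                /* appends a BTF_KIND_FLOAT type named "double", 8 bytes */
                id = btf__add_float(btf, "double", 8);
                if (id < 0) {
                        btf__free(btf);
                        return 1;
                }

                printf("FLOAT type id: %d\n", id);
                btf__free(btf);
                return 0;
        }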
index 1237bcd..029a9cf 100644 (file)
@@ -95,6 +95,7 @@ LIBBPF_API int btf__find_str(struct btf *btf, const char *s);
 LIBBPF_API int btf__add_str(struct btf *btf, const char *s);
 
 LIBBPF_API int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding);
+LIBBPF_API int btf__add_float(struct btf *btf, const char *name, size_t byte_sz);
 LIBBPF_API int btf__add_ptr(struct btf *btf, int ref_type_id);
 LIBBPF_API int btf__add_array(struct btf *btf,
                              int index_type_id, int elem_type_id, __u32 nr_elems);
@@ -294,6 +295,11 @@ static inline bool btf_is_datasec(const struct btf_type *t)
        return btf_kind(t) == BTF_KIND_DATASEC;
 }
 
+static inline bool btf_is_float(const struct btf_type *t)
+{
+       return btf_kind(t) == BTF_KIND_FLOAT;
+}
+
 static inline __u8 btf_int_encoding(const struct btf_type *t)
 {
        return BTF_INT_ENCODING(*(__u32 *)(t + 1));
index 2f9d685..5e957fc 100644 (file)
@@ -279,6 +279,7 @@ static int btf_dump_mark_referenced(struct btf_dump *d)
                case BTF_KIND_INT:
                case BTF_KIND_ENUM:
                case BTF_KIND_FWD:
+               case BTF_KIND_FLOAT:
                        break;
 
                case BTF_KIND_VOLATILE:
@@ -453,6 +454,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr)
 
        switch (btf_kind(t)) {
        case BTF_KIND_INT:
+       case BTF_KIND_FLOAT:
                tstate->order_state = ORDERED;
                return 0;
 
@@ -1133,6 +1135,7 @@ skip_mod:
                case BTF_KIND_STRUCT:
                case BTF_KIND_UNION:
                case BTF_KIND_TYPEDEF:
+               case BTF_KIND_FLOAT:
                        goto done;
                default:
                        pr_warn("unexpected type in decl chain, kind:%u, id:[%u]\n",
@@ -1247,6 +1250,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d,
 
                switch (kind) {
                case BTF_KIND_INT:
+               case BTF_KIND_FLOAT:
                        btf_dump_emit_mods(d, decls);
                        name = btf_name_of(d, t->name_off);
                        btf_dump_printf(d, "%s", name);
index d43cc3f..2f351d3 100644 (file)
@@ -178,6 +178,8 @@ enum kern_feature_id {
        FEAT_PROG_BIND_MAP,
        /* Kernel support for module BTFs */
        FEAT_MODULE_BTF,
+       /* BTF_KIND_FLOAT support */
+       FEAT_BTF_FLOAT,
        __FEAT_CNT,
 };
 
@@ -188,6 +190,7 @@ enum reloc_type {
        RELO_CALL,
        RELO_DATA,
        RELO_EXTERN,
+       RELO_SUBPROG_ADDR,
 };
 
 struct reloc_desc {
@@ -574,6 +577,16 @@ static bool insn_is_subprog_call(const struct bpf_insn *insn)
               insn->off == 0;
 }
 
+static bool is_ldimm64(struct bpf_insn *insn)
+{
+       return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
+}
+
+static bool insn_is_pseudo_func(struct bpf_insn *insn)
+{
+       return is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC;
+}
+
 static int
 bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog,
                      const char *name, size_t sec_idx, const char *sec_name,
@@ -1935,6 +1948,7 @@ static const char *btf_kind_str(const struct btf_type *t)
        case BTF_KIND_FUNC_PROTO: return "func_proto";
        case BTF_KIND_VAR: return "var";
        case BTF_KIND_DATASEC: return "datasec";
+       case BTF_KIND_FLOAT: return "float";
        default: return "unknown";
        }
 }
@@ -2384,15 +2398,17 @@ static bool btf_needs_sanitization(struct bpf_object *obj)
 {
        bool has_func_global = kernel_supports(FEAT_BTF_GLOBAL_FUNC);
        bool has_datasec = kernel_supports(FEAT_BTF_DATASEC);
+       bool has_float = kernel_supports(FEAT_BTF_FLOAT);
        bool has_func = kernel_supports(FEAT_BTF_FUNC);
 
-       return !has_func || !has_datasec || !has_func_global;
+       return !has_func || !has_datasec || !has_func_global || !has_float;
 }
 
 static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf)
 {
        bool has_func_global = kernel_supports(FEAT_BTF_GLOBAL_FUNC);
        bool has_datasec = kernel_supports(FEAT_BTF_DATASEC);
+       bool has_float = kernel_supports(FEAT_BTF_FLOAT);
        bool has_func = kernel_supports(FEAT_BTF_FUNC);
        struct btf_type *t;
        int i, j, vlen;
@@ -2445,6 +2461,13 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf)
                } else if (!has_func_global && btf_is_func(t)) {
                        /* replace BTF_FUNC_GLOBAL with BTF_FUNC_STATIC */
                        t->info = BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0);
+               } else if (!has_float && btf_is_float(t)) {
+                       /* replace FLOAT with an equally-sized empty STRUCT;
+                        * since C compilers do not accept e.g. "float" as a
+                        * valid struct name, make it anonymous
+                        */
+                       t->name_off = 0;
+                       t->info = BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 0);
                }
        }
 }
@@ -2974,6 +2997,23 @@ static bool sym_is_extern(const GElf_Sym *sym)
               GELF_ST_TYPE(sym->st_info) == STT_NOTYPE;
 }
 
+static bool sym_is_subprog(const GElf_Sym *sym, int text_shndx)
+{
+       int bind = GELF_ST_BIND(sym->st_info);
+       int type = GELF_ST_TYPE(sym->st_info);
+
+       /* in .text section */
+       if (sym->st_shndx != text_shndx)
+               return false;
+
+       /* local function */
+       if (bind == STB_LOCAL && type == STT_SECTION)
+               return true;
+
+       /* global function */
+       return bind == STB_GLOBAL && type == STT_FUNC;
+}
+
 static int find_extern_btf_id(const struct btf *btf, const char *ext_name)
 {
        const struct btf_type *t;
@@ -3395,7 +3435,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog,
                return 0;
        }
 
-       if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) {
+       if (!is_ldimm64(insn)) {
                pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n",
                        prog->name, sym_name, insn_idx, insn->code);
                return -LIBBPF_ERRNO__RELOC;
@@ -3430,6 +3470,23 @@ static int bpf_program__record_reloc(struct bpf_program *prog,
                return -LIBBPF_ERRNO__RELOC;
        }
 
+       /* loading subprog addresses */
+       if (sym_is_subprog(sym, obj->efile.text_shndx)) {
+               /* global_func: sym->st_value = offset in the section, insn->imm = 0.
+                * local_func: sym->st_value = 0, insn->imm = offset in the section.
+                */
+               if ((sym->st_value % BPF_INSN_SZ) || (insn->imm % BPF_INSN_SZ)) {
+                       pr_warn("prog '%s': bad subprog addr relo against '%s' at offset %zu+%d\n",
+                               prog->name, sym_name, (size_t)sym->st_value, insn->imm);
+                       return -LIBBPF_ERRNO__RELOC;
+               }
+
+               reloc_desc->type = RELO_SUBPROG_ADDR;
+               reloc_desc->insn_idx = insn_idx;
+               reloc_desc->sym_off = sym->st_value;
+               return 0;
+       }
+
        type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx);
        sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx));
 
@@ -3882,6 +3939,18 @@ static int probe_kern_btf_datasec(void)
                                             strs, sizeof(strs)));
 }
 
+static int probe_kern_btf_float(void)
+{
+       static const char strs[] = "\0float";
+       __u32 types[] = {
+               /* float */
+               BTF_TYPE_FLOAT_ENC(1, 4),
+       };
+
+       return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+                                            strs, sizeof(strs)));
+}
+
 static int probe_kern_array_mmap(void)
 {
        struct bpf_create_map_attr attr = {
@@ -4061,6 +4130,9 @@ static struct kern_feature_desc {
        [FEAT_MODULE_BTF] = {
                "module BTF support", probe_module_btf,
        },
+       [FEAT_BTF_FLOAT] = {
+               "BTF_KIND_FLOAT support", probe_kern_btf_float,
+       },
 };
 
 static bool kernel_supports(enum kern_feature_id feat_id)
@@ -5566,11 +5638,6 @@ static void bpf_core_poison_insn(struct bpf_program *prog, int relo_idx,
        insn->imm = 195896080; /* => 0xbad2310 => "bad relo" */
 }
 
-static bool is_ldimm64(struct bpf_insn *insn)
-{
-       return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
-}
-
 static int insn_bpf_size_to_bytes(struct bpf_insn *insn)
 {
        switch (BPF_SIZE(insn->code)) {
@@ -6172,6 +6239,10 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog)
                        }
                        relo->processed = true;
                        break;
+               case RELO_SUBPROG_ADDR:
+                       insn[0].src_reg = BPF_PSEUDO_FUNC;
+                       /* will be handled as a follow up pass */
+                       break;
                case RELO_CALL:
                        /* will be handled as a follow up pass */
                        break;
@@ -6358,11 +6429,11 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog,
 
        for (insn_idx = 0; insn_idx < prog->sec_insn_cnt; insn_idx++) {
                insn = &main_prog->insns[prog->sub_insn_off + insn_idx];
-               if (!insn_is_subprog_call(insn))
+               if (!insn_is_subprog_call(insn) && !insn_is_pseudo_func(insn))
                        continue;
 
                relo = find_prog_insn_relo(prog, insn_idx);
-               if (relo && relo->type != RELO_CALL) {
+               if (relo && relo->type != RELO_CALL && relo->type != RELO_SUBPROG_ADDR) {
                        pr_warn("prog '%s': unexpected relo for insn #%zu, type %d\n",
                                prog->name, insn_idx, relo->type);
                        return -LIBBPF_ERRNO__RELOC;
@@ -6374,8 +6445,22 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog,
                         * call always has imm = -1, but for static functions
                         * relocation is against STT_SECTION and insn->imm
                         * points to a start of a static function
+                        *
+                        * for subprog addr relocation, the relo->sym_off + insn->imm is
+                        * the byte offset in the corresponding section.
+                        */
+                       if (relo->type == RELO_CALL)
+                               sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1;
+                       else
+                               sub_insn_idx = (relo->sym_off + insn->imm) / BPF_INSN_SZ;
+               } else if (insn_is_pseudo_func(insn)) {
+                       /*
+                        * RELO_SUBPROG_ADDR relo is always emitted even if both
+                        * functions are in the same section, so it shouldn't reach here.
                         */
-                       sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1;
+                       pr_warn("prog '%s': missing subprog addr relo for insn #%zu\n",
+                               prog->name, insn_idx);
+                       return -LIBBPF_ERRNO__RELOC;
                } else {
                        /* if subprogram call is to a static function within
                         * the same ELF section, there won't be any relocation
index 1c0fd2d..ec898f4 100644 (file)
@@ -350,3 +350,8 @@ LIBBPF_0.3.0 {
                xsk_setup_xdp_prog;
                xsk_socket__update_xskmap;
 } LIBBPF_0.2.0;
+
+LIBBPF_0.4.0 {
+       global:
+               btf__add_float;
+} LIBBPF_0.3.0;
index 969d0ac..343f6eb 100644 (file)
@@ -31,6 +31,8 @@
 #define BTF_MEMBER_ENC(name, type, bits_offset) (name), (type), (bits_offset)
 #define BTF_PARAM_ENC(name, type) (name), (type)
 #define BTF_VAR_SECINFO_ENC(type, offset, size) (type), (offset), (size)
+#define BTF_TYPE_FLOAT_ENC(name, sz) \
+       BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 0), sz)
 
 #ifndef likely
 #define likely(x) __builtin_expect(!!(x), 1)
index 59c779c..cfbcfc0 100644 (file)
@@ -5,6 +5,7 @@
 #define __LIBBPF_LIBBPF_UTIL_H
 
 #include <stdbool.h>
+#include <linux/compiler.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -15,29 +16,56 @@ extern "C" {
  * application that uses libbpf.
  */
 #if defined(__i386__) || defined(__x86_64__)
-# define libbpf_smp_rmb() asm volatile("" : : : "memory")
-# define libbpf_smp_wmb() asm volatile("" : : : "memory")
-# define libbpf_smp_mb() \
-       asm volatile("lock; addl $0,-4(%%rsp)" : : : "memory", "cc")
-/* Hinders stores to be observed before older loads. */
-# define libbpf_smp_rwmb() asm volatile("" : : : "memory")
+# define libbpf_smp_store_release(p, v)                                        \
+       do {                                                            \
+               asm volatile("" : : : "memory");                        \
+               WRITE_ONCE(*p, v);                                      \
+       } while (0)
+# define libbpf_smp_load_acquire(p)                                    \
+       ({                                                              \
+               typeof(*p) ___p1 = READ_ONCE(*p);                       \
+               asm volatile("" : : : "memory");                        \
+               ___p1;                                                  \
+       })
 #elif defined(__aarch64__)
-# define libbpf_smp_rmb() asm volatile("dmb ishld" : : : "memory")
-# define libbpf_smp_wmb() asm volatile("dmb ishst" : : : "memory")
-# define libbpf_smp_mb() asm volatile("dmb ish" : : : "memory")
-# define libbpf_smp_rwmb() libbpf_smp_mb()
-#elif defined(__arm__)
-/* These are only valid for armv7 and above */
-# define libbpf_smp_rmb() asm volatile("dmb ish" : : : "memory")
-# define libbpf_smp_wmb() asm volatile("dmb ishst" : : : "memory")
-# define libbpf_smp_mb() asm volatile("dmb ish" : : : "memory")
-# define libbpf_smp_rwmb() libbpf_smp_mb()
-#else
-/* Architecture missing native barrier functions. */
-# define libbpf_smp_rmb() __sync_synchronize()
-# define libbpf_smp_wmb() __sync_synchronize()
-# define libbpf_smp_mb() __sync_synchronize()
-# define libbpf_smp_rwmb() __sync_synchronize()
+# define libbpf_smp_store_release(p, v)                                        \
+               asm volatile ("stlr %w1, %0" : "=Q" (*p) : "r" (v) : "memory")
+# define libbpf_smp_load_acquire(p)                                    \
+       ({                                                              \
+               typeof(*p) ___p1;                                       \
+               asm volatile ("ldar %w0, %1"                            \
+                             : "=r" (___p1) : "Q" (*p) : "memory");    \
+               ___p1;                                                  \
+       })
+#elif defined(__riscv)
+# define libbpf_smp_store_release(p, v)                                        \
+       do {                                                            \
+               asm volatile ("fence rw,w" : : : "memory");             \
+               WRITE_ONCE(*p, v);                                      \
+       } while (0)
+# define libbpf_smp_load_acquire(p)                                    \
+       ({                                                              \
+               typeof(*p) ___p1 = READ_ONCE(*p);                       \
+               asm volatile ("fence r,rw" : : : "memory");             \
+               ___p1;                                                  \
+       })
+#endif
+
+#ifndef libbpf_smp_store_release
+#define libbpf_smp_store_release(p, v)                                 \
+       do {                                                            \
+               __sync_synchronize();                                   \
+               WRITE_ONCE(*p, v);                                      \
+       } while (0)
+#endif
+
+#ifndef libbpf_smp_load_acquire
+#define libbpf_smp_load_acquire(p)                                     \
+       ({                                                              \
+               typeof(*p) ___p1 = READ_ONCE(*p);                       \
+               __sync_synchronize();                                   \
+               ___p1;                                                  \
+       })
 #endif
 
 #ifdef __cplusplus
index e9f121f..a9fdea8 100644 (file)
@@ -96,7 +96,8 @@ static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
         * this function. Without this optimization it would have been
         * free_entries = r->cached_prod - r->cached_cons + r->size.
         */
-       r->cached_cons = *r->consumer + r->size;
+       r->cached_cons = libbpf_smp_load_acquire(r->consumer);
+       r->cached_cons += r->size;
 
        return r->cached_cons - r->cached_prod;
 }
@@ -106,7 +107,7 @@ static inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb)
        __u32 entries = r->cached_prod - r->cached_cons;
 
        if (entries == 0) {
-               r->cached_prod = *r->producer;
+               r->cached_prod = libbpf_smp_load_acquire(r->producer);
                entries = r->cached_prod - r->cached_cons;
        }
 
@@ -129,9 +130,7 @@ static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb)
        /* Make sure everything has been written to the ring before indicating
         * this to the kernel by writing the producer pointer.
         */
-       libbpf_smp_wmb();
-
-       *prod->producer += nb;
+       libbpf_smp_store_release(prod->producer, *prod->producer + nb);
 }
 
 static inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx)
@@ -139,11 +138,6 @@ static inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __
        __u32 entries = xsk_cons_nb_avail(cons, nb);
 
        if (entries > 0) {
-               /* Make sure we do not speculatively read the data before
-                * we have received the packet buffers from the ring.
-                */
-               libbpf_smp_rmb();
-
                *idx = cons->cached_cons;
                cons->cached_cons += entries;
        }
@@ -161,9 +155,8 @@ static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb)
        /* Make sure data has been read before indicating we are done
         * with the entries by updating the consumer pointer.
         */
-       libbpf_smp_rwmb();
+       libbpf_smp_store_release(cons->consumer, *cons->consumer + nb);
 
-       *cons->consumer += nb;
 }
 
 static inline void *xsk_umem__get_data(void *umem_area, __u64 addr)
index 5d7b947..f05c4d4 100644 (file)
@@ -20,4 +20,4 @@ tools/lib/bitmap.c
 tools/lib/str_error_r.c
 tools/lib/vsprintf.c
 tools/lib/zalloc.c
-scripts/bpf_helpers_doc.py
+scripts/bpf_doc.py
index c0c48fd..4866f6a 100644 (file)
@@ -1,4 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
+bpf-helpers*
+bpf-syscall*
 test_verifier
 test_maps
 test_lru_map
index 044bfdc..c399958 100644 (file)
@@ -68,6 +68,7 @@ TEST_PROGS := test_kmod.sh \
        test_bpftool_build.sh \
        test_bpftool.sh \
        test_bpftool_metadata.sh \
+       test_doc_build.sh \
        test_xsk.sh
 
 TEST_PROGS_EXTENDED := with_addr.sh \
@@ -103,6 +104,7 @@ override define CLEAN
        $(call msg,CLEAN)
        $(Q)$(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN)
        $(Q)$(MAKE) -C bpf_testmod clean
+       $(Q)$(MAKE) docs-clean
 endef
 
 include ../lib.mk
@@ -180,6 +182,7 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL)
                    cp $(SCRATCH_DIR)/runqslower $@
 
 $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ)
+$(TEST_GEN_FILES): docs
 
 $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c
 $(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c
@@ -200,11 +203,16 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile)    \
                    CC=$(HOSTCC) LD=$(HOSTLD)                                  \
                    OUTPUT=$(HOST_BUILD_DIR)/bpftool/                          \
                    prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install
-       $(Q)mkdir -p $(BUILD_DIR)/bpftool/Documentation
-       $(Q)RST2MAN_OPTS="--exit-status=1" $(MAKE) $(submake_extras)           \
-                   -C $(BPFTOOLDIR)/Documentation                             \
-                   OUTPUT=$(BUILD_DIR)/bpftool/Documentation/                 \
-                   prefix= DESTDIR=$(SCRATCH_DIR)/ install
+
+docs:
+       $(Q)RST2MAN_OPTS="--exit-status=1" $(MAKE) $(submake_extras)    \
+                   -f Makefile.docs                                    \
+                   prefix= OUTPUT=$(OUTPUT)/ DESTDIR=$(OUTPUT)/ $@
+
+docs-clean:
+       $(Q)$(MAKE) $(submake_extras)                                   \
+                   -f Makefile.docs                                    \
+                   prefix= OUTPUT=$(OUTPUT)/ DESTDIR=$(OUTPUT)/ $@
 
 $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)                    \
           ../../../include/uapi/linux/bpf.h                                   \
@@ -382,11 +390,12 @@ $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o:                             \
        $$(call msg,EXT-OBJ,$(TRUNNER_BINARY),$$@)
        $(Q)$$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@
 
-# only copy extra resources if in flavored build
+# non-flavored in-srctree builds receive special treatment; in particular, we
+# do not need to copy extra resources (see e.g. test_btf_dump_case())
 $(TRUNNER_BINARY)-extras: $(TRUNNER_EXTRA_FILES) | $(TRUNNER_OUTPUT)
-ifneq ($2,)
+ifneq ($2:$(OUTPUT),:$(shell pwd))
        $$(call msg,EXT-COPY,$(TRUNNER_BINARY),$(TRUNNER_EXTRA_FILES))
-       $(Q)cp -a $$^ $(TRUNNER_OUTPUT)/
+       $(Q)rsync -aq $$^ $(TRUNNER_OUTPUT)/
 endif
 
 $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS)                      \
@@ -476,3 +485,5 @@ EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) $(HOST_SCRATCH_DIR)      \
        prog_tests/tests.h map_tests/tests.h verifier/tests.h           \
        feature                                                         \
        $(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32 bpf_gcc bpf_testmod.ko)
+
+.PHONY: docs docs-clean
diff --git a/tools/testing/selftests/bpf/Makefile.docs b/tools/testing/selftests/bpf/Makefile.docs
new file mode 100644 (file)
index 0000000..ccf2600
--- /dev/null
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+include ../../../scripts/Makefile.include
+include ../../../scripts/utilities.mak
+
+INSTALL ?= install
+RM ?= rm -f
+RMDIR ?= rmdir --ignore-fail-on-non-empty
+
+ifeq ($(V),1)
+  Q =
+else
+  Q = @
+endif
+
+prefix ?= /usr/local
+mandir ?= $(prefix)/man
+man2dir = $(mandir)/man2
+man7dir = $(mandir)/man7
+
+SYSCALL_RST = bpf-syscall.rst
+MAN2_RST = $(SYSCALL_RST)
+
+HELPERS_RST = bpf-helpers.rst
+MAN7_RST = $(HELPERS_RST)
+
+_DOC_MAN2 = $(patsubst %.rst,%.2,$(MAN2_RST))
+DOC_MAN2 = $(addprefix $(OUTPUT),$(_DOC_MAN2))
+
+_DOC_MAN7 = $(patsubst %.rst,%.7,$(MAN7_RST))
+DOC_MAN7 = $(addprefix $(OUTPUT),$(_DOC_MAN7))
+
+DOCTARGETS := helpers syscall
+
+docs: $(DOCTARGETS)
+syscall: man2
+helpers: man7
+man2: $(DOC_MAN2)
+man7: $(DOC_MAN7)
+
+RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null)
+
+# Configure make rules for the man page bpf-$1.$2.
+# $1 - target for scripts/bpf_doc.py
+# $2 - man page section to generate the troff file
+define DOCS_RULES =
+$(OUTPUT)bpf-$1.rst: ../../../../include/uapi/linux/bpf.h
+       $$(QUIET_GEN)../../../../scripts/bpf_doc.py $1 \
+               --filename $$< > $$@
+
+$(OUTPUT)%.$2: $(OUTPUT)%.rst
+ifndef RST2MAN_DEP
+       $$(error "rst2man not found, but required to generate man pages")
+endif
+       $$(QUIET_GEN)rst2man $$< > $$@
+
+docs-clean-$1:
+       $$(call QUIET_CLEAN, eBPF_$1-manpage)
+       $(Q)$(RM) $$(DOC_MAN$2) $(OUTPUT)bpf-$1.rst
+
+docs-install-$1: docs
+       $$(call QUIET_INSTALL, eBPF_$1-manpage)
+       $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$$(man$2dir)
+       $(Q)$(INSTALL) -m 644 $$(DOC_MAN$2) $(DESTDIR)$$(man$2dir)
+
+docs-uninstall-$1:
+       $$(call QUIET_UNINST, eBPF_$1-manpage)
+       $(Q)$(RM) $$(addprefix $(DESTDIR)$$(man$2dir)/,$$(_DOC_MAN$2))
+       $(Q)$(RMDIR) $(DESTDIR)$$(man$2dir)
+
+.PHONY: $1 docs-clean-$1 docs-install-$1 docs-uninstall-$1
+endef
+
+# Create the make targets to generate manual pages by name and section
+$(eval $(call DOCS_RULES,helpers,7))
+$(eval $(call DOCS_RULES,syscall,2))
+
+docs-clean: $(foreach doctarget,$(DOCTARGETS), docs-clean-$(doctarget))
+docs-install: $(foreach doctarget,$(DOCTARGETS), docs-install-$(doctarget))
+docs-uninstall: $(foreach doctarget,$(DOCTARGETS), docs-uninstall-$(doctarget))
+
+.PHONY: docs docs-clean docs-install docs-uninstall man2 man7
index fd148b8..3464161 100644 (file)
@@ -111,6 +111,45 @@ available in 10.0.1. The patch is available in llvm 11.0.0 trunk.
 
 __  https://reviews.llvm.org/D78466
 
+bpf_verif_scale/loop6.o test failure with Clang 12
+==================================================
+
+With Clang 12, the following bpf_verif_scale test failed:
+  * ``bpf_verif_scale/loop6.o``
+
+The verifier output looks like:
+
+.. code-block:: c
+
+  R1 type=ctx expected=fp
+  The sequence of 8193 jumps is too complex.
+
+The reason is that the compiler generates the following code:
+
+.. code-block:: c
+
+  ;       for (i = 0; (i < VIRTIO_MAX_SGS) && (i < num); i++) {
+      14:       16 05 40 00 00 00 00 00 if w5 == 0 goto +64 <LBB0_6>
+      15:       bc 51 00 00 00 00 00 00 w1 = w5
+      16:       04 01 00 00 ff ff ff ff w1 += -1
+      17:       67 05 00 00 20 00 00 00 r5 <<= 32
+      18:       77 05 00 00 20 00 00 00 r5 >>= 32
+      19:       a6 01 01 00 05 00 00 00 if w1 < 5 goto +1 <LBB0_4>
+      20:       b7 05 00 00 06 00 00 00 r5 = 6
+  00000000000000a8 <LBB0_4>:
+      21:       b7 02 00 00 00 00 00 00 r2 = 0
+      22:       b7 01 00 00 00 00 00 00 r1 = 0
+  ;       for (i = 0; (i < VIRTIO_MAX_SGS) && (i < num); i++) {
+      23:       7b 1a e0 ff 00 00 00 00 *(u64 *)(r10 - 32) = r1
+      24:       7b 5a c0 ff 00 00 00 00 *(u64 *)(r10 - 64) = r5
+
+Note that insn #15 has w1 = w5 and w1 is refined later, but
+r5 (w5) is eventually saved on the stack at insn #24 for later use.
+This causes a later verifier failure. The bug has been `fixed`__ in
+Clang 13.
+
+__  https://reviews.llvm.org/D97479
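+
+The test source limits the verifier state explosion with a build-time
+workaround that caps how many scatterlist entries it walks; a minimal
+sketch of that pattern (mirroring the WORKAROUND define in loop6.c):
+
+.. code-block:: c
+
+  #ifdef WORKAROUND
+  #define SG_MAX 10   /* small bound keeps the verifier state count low */
+  #else
+  #define SG_MAX 128  /* typical virtio blk max segments; too complex */
+  #endif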
+
 BPF CO-RE-based tests and Clang version
 =======================================
 
@@ -131,3 +170,12 @@ failures:
 .. _2: https://reviews.llvm.org/D85174
 .. _3: https://reviews.llvm.org/D83878
 .. _4: https://reviews.llvm.org/D83242
+
+Floating-point tests and Clang version
+======================================
+
+Certain selftests, e.g. core_reloc, require support for floating-point
+types, which was introduced in `Clang 13`__. Older Clang versions will
+either crash when compiling these tests or generate incorrect BTF.
+
+__  https://reviews.llvm.org/D83289
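+
+For illustration, all these tests need is a floating-point member that
+the compiler must encode as BTF_KIND_FLOAT; a minimal sketch, mirroring
+the float_field member added to the core_reloc size tests:
+
+.. code-block:: c
+
+  struct core_reloc_size {
+          /* ... other fields ... */
+          float float_field;  /* needs BTF_KIND_FLOAT, i.e. Clang >= 13 */
+  };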
index 48f9049..b692e6e 100644 (file)
@@ -23,6 +23,7 @@ static const char * const btf_kind_str_mapping[] = {
        [BTF_KIND_FUNC_PROTO]   = "FUNC_PROTO",
        [BTF_KIND_VAR]          = "VAR",
        [BTF_KIND_DATASEC]      = "DATASEC",
+       [BTF_KIND_FLOAT]        = "FLOAT",
 };
 
 static const char *btf_kind_str(__u16 kind)
@@ -173,6 +174,9 @@ int fprintf_btf_type_raw(FILE *out, const struct btf *btf, __u32 id)
                }
                break;
        }
+       case BTF_KIND_FLOAT:
+               fprintf(out, " size=%u", t->size);
+               break;
        default:
                break;
        }
index a0ee87c..9dc4e3d 100644 (file)
@@ -2,6 +2,44 @@
 #include <test_progs.h>
 #include "test_attach_probe.skel.h"
 
+#if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2
+
+#define OP_RT_RA_MASK   0xffff0000UL
+#define LIS_R2          0x3c400000UL
+#define ADDIS_R2_R12    0x3c4c0000UL
+#define ADDI_R2_R2      0x38420000UL
+
+static ssize_t get_offset(ssize_t addr, ssize_t base)
+{
+       u32 *insn = (u32 *) addr;
+
+       /*
+        * A PPC64 ABIv2 function may have a local and a global entry
+        * point. We need to use the local entry point when patching
+        * functions, so identify and step over the global entry point
+        * sequence.
+        *
+        * The global entry point sequence is always of the form:
+        *
+        * addis r2,r12,XXXX
+        * addi  r2,r2,XXXX
+        *
+        * A linker optimisation may convert the addis to lis:
+        *
+        * lis   r2,XXXX
+        * addi  r2,r2,XXXX
+        */
+       if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
+            ((*insn & OP_RT_RA_MASK) == LIS_R2)) &&
+           ((*(insn + 1) & OP_RT_RA_MASK) == ADDI_R2_R2))
+               return (ssize_t)(insn + 2) - base;
+       else
+               return addr - base;
+}
+#else
+#define get_offset(addr, base) (addr - base)
+#endif
+
 ssize_t get_base_addr() {
        size_t start, offset;
        char buf[256];
@@ -36,7 +74,7 @@ void test_attach_probe(void)
        if (CHECK(base_addr < 0, "get_base_addr",
                  "failed to find base addr: %zd", base_addr))
                return;
-       uprobe_offset = (size_t)&get_base_addr - base_addr;
+       uprobe_offset = get_offset((size_t)&get_base_addr, base_addr);
 
        skel = test_attach_probe__open_and_load();
        if (CHECK(!skel, "skel_open", "failed to open skeleton\n"))
index e698ee6..3d002c2 100644 (file)
@@ -76,6 +76,7 @@ void test_bpf_verif_scale(void)
                { "loop2.o", BPF_PROG_TYPE_RAW_TRACEPOINT },
                { "loop4.o", BPF_PROG_TYPE_SCHED_CLS },
                { "loop5.o", BPF_PROG_TYPE_SCHED_CLS },
+               { "loop6.o", BPF_PROG_TYPE_KPROBE },
 
                /* partial unroll. 19k insn in a loop.
                 * Total program size 20.8k insn.
index 6a7ee74..0457ae3 100644 (file)
@@ -1903,7 +1903,7 @@ static struct btf_raw_test raw_tests[] = {
        .raw_types = {
                /* int */                               /* [1] */
                BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
-               BTF_TYPE_ENC(0, 0x10000000, 4),
+               BTF_TYPE_ENC(0, 0x20000000, 4),
                BTF_END_RAW,
        },
        .str_sec = "",
@@ -3531,6 +3531,136 @@ static struct btf_raw_test raw_tests[] = {
        .max_entries = 1,
 },
 
+{
+       .descr = "float test #1, well-formed",
+       .raw_types = {
+               BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+                                                               /* [1] */
+               BTF_TYPE_FLOAT_ENC(NAME_TBD, 2),                /* [2] */
+               BTF_TYPE_FLOAT_ENC(NAME_TBD, 4),                /* [3] */
+               BTF_TYPE_FLOAT_ENC(NAME_TBD, 8),                /* [4] */
+               BTF_TYPE_FLOAT_ENC(NAME_TBD, 12),               /* [5] */
+               BTF_TYPE_FLOAT_ENC(NAME_TBD, 16),               /* [6] */
+               BTF_STRUCT_ENC(NAME_TBD, 5, 48),                /* [7] */
+               BTF_MEMBER_ENC(NAME_TBD, 2, 0),
+               BTF_MEMBER_ENC(NAME_TBD, 3, 32),
+               BTF_MEMBER_ENC(NAME_TBD, 4, 64),
+               BTF_MEMBER_ENC(NAME_TBD, 5, 128),
+               BTF_MEMBER_ENC(NAME_TBD, 6, 256),
+               BTF_END_RAW,
+       },
+       BTF_STR_SEC("\0int\0_Float16\0float\0double\0_Float80\0long_double"
+                   "\0floats\0a\0b\0c\0d\0e"),
+       .map_type = BPF_MAP_TYPE_ARRAY,
+       .map_name = "float_type_check_btf",
+       .key_size = sizeof(int),
+       .value_size = 48,
+       .key_type_id = 1,
+       .value_type_id = 7,
+       .max_entries = 1,
+},
+{
+       .descr = "float test #2, invalid vlen",
+       .raw_types = {
+               BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+                                                               /* [1] */
+               BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 1), 4),
+                                                               /* [2] */
+               BTF_END_RAW,
+       },
+       BTF_STR_SEC("\0int\0float"),
+       .map_type = BPF_MAP_TYPE_ARRAY,
+       .map_name = "float_type_check_btf",
+       .key_size = sizeof(int),
+       .value_size = 4,
+       .key_type_id = 1,
+       .value_type_id = 2,
+       .max_entries = 1,
+       .btf_load_err = true,
+       .err_str = "vlen != 0",
+},
+{
+       .descr = "float test #3, invalid kind_flag",
+       .raw_types = {
+               BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+                                                               /* [1] */
+               BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_FLOAT, 1, 0), 4),
+                                                               /* [2] */
+               BTF_END_RAW,
+       },
+       BTF_STR_SEC("\0int\0float"),
+       .map_type = BPF_MAP_TYPE_ARRAY,
+       .map_name = "float_type_check_btf",
+       .key_size = sizeof(int),
+       .value_size = 4,
+       .key_type_id = 1,
+       .value_type_id = 2,
+       .max_entries = 1,
+       .btf_load_err = true,
+       .err_str = "Invalid btf_info kind_flag",
+},
+{
+       .descr = "float test #4, member does not fit",
+       .raw_types = {
+               BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+                                                               /* [1] */
+               BTF_TYPE_FLOAT_ENC(NAME_TBD, 4),                /* [2] */
+               BTF_STRUCT_ENC(NAME_TBD, 1, 2),                 /* [3] */
+               BTF_MEMBER_ENC(NAME_TBD, 2, 0),
+               BTF_END_RAW,
+       },
+       BTF_STR_SEC("\0int\0float\0floats\0x"),
+       .map_type = BPF_MAP_TYPE_ARRAY,
+       .map_name = "float_type_check_btf",
+       .key_size = sizeof(int),
+       .value_size = 4,
+       .key_type_id = 1,
+       .value_type_id = 3,
+       .max_entries = 1,
+       .btf_load_err = true,
+       .err_str = "Member exceeds struct_size",
+},
+{
+       .descr = "float test #5, member is not properly aligned",
+       .raw_types = {
+               BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+                                                               /* [1] */
+               BTF_TYPE_FLOAT_ENC(NAME_TBD, 4),                /* [2] */
+               BTF_STRUCT_ENC(NAME_TBD, 1, 8),                 /* [3] */
+               BTF_MEMBER_ENC(NAME_TBD, 2, 8),
+               BTF_END_RAW,
+       },
+       BTF_STR_SEC("\0int\0float\0floats\0x"),
+       .map_type = BPF_MAP_TYPE_ARRAY,
+       .map_name = "float_type_check_btf",
+       .key_size = sizeof(int),
+       .value_size = 4,
+       .key_type_id = 1,
+       .value_type_id = 3,
+       .max_entries = 1,
+       .btf_load_err = true,
+       .err_str = "Member is not properly aligned",
+},
+{
+       .descr = "float test #6, invalid size",
+       .raw_types = {
+               BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+                                                               /* [1] */
+               BTF_TYPE_FLOAT_ENC(NAME_TBD, 6),                /* [2] */
+               BTF_END_RAW,
+       },
+       BTF_STR_SEC("\0int\0float"),
+       .map_type = BPF_MAP_TYPE_ARRAY,
+       .map_name = "float_type_check_btf",
+       .key_size = sizeof(int),
+       .value_size = 6,
+       .key_type_id = 1,
+       .value_type_id = 2,
+       .max_entries = 1,
+       .btf_load_err = true,
+       .err_str = "Invalid type_size",
+},
+
 }; /* struct btf_raw_test raw_tests[] */
 
 static const char *get_next_str(const char *start, const char *end)
@@ -6281,11 +6411,12 @@ const struct btf_dedup_test dedup_tests[] = {
                        /* int[16] */
                        BTF_TYPE_ARRAY_ENC(1, 1, 16),                                   /* [2] */
                        /* struct s { */
-                       BTF_STRUCT_ENC(NAME_NTH(2), 4, 84),                             /* [3] */
+                       BTF_STRUCT_ENC(NAME_NTH(2), 5, 88),                             /* [3] */
                                BTF_MEMBER_ENC(NAME_NTH(3), 4, 0),      /* struct s *next;      */
                                BTF_MEMBER_ENC(NAME_NTH(4), 5, 64),     /* const int *a;        */
                                BTF_MEMBER_ENC(NAME_NTH(5), 2, 128),    /* int b[16];           */
                                BTF_MEMBER_ENC(NAME_NTH(6), 1, 640),    /* int c;               */
+                               BTF_MEMBER_ENC(NAME_NTH(8), 13, 672),   /* float d;             */
                        /* ptr -> [3] struct s */
                        BTF_PTR_ENC(3),                                                 /* [4] */
                        /* ptr -> [6] const int */
@@ -6296,39 +6427,43 @@ const struct btf_dedup_test dedup_tests[] = {
                        /* full copy of the above */
                        BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4),        /* [7] */
                        BTF_TYPE_ARRAY_ENC(7, 7, 16),                                   /* [8] */
-                       BTF_STRUCT_ENC(NAME_NTH(2), 4, 84),                             /* [9] */
+                       BTF_STRUCT_ENC(NAME_NTH(2), 5, 88),                             /* [9] */
                                BTF_MEMBER_ENC(NAME_NTH(3), 10, 0),
                                BTF_MEMBER_ENC(NAME_NTH(4), 11, 64),
                                BTF_MEMBER_ENC(NAME_NTH(5), 8, 128),
                                BTF_MEMBER_ENC(NAME_NTH(6), 7, 640),
+                               BTF_MEMBER_ENC(NAME_NTH(8), 13, 672),
                        BTF_PTR_ENC(9),                                                 /* [10] */
                        BTF_PTR_ENC(12),                                                /* [11] */
                        BTF_CONST_ENC(7),                                               /* [12] */
+                       BTF_TYPE_FLOAT_ENC(NAME_NTH(7), 4),                             /* [13] */
                        BTF_END_RAW,
                },
-               BTF_STR_SEC("\0int\0s\0next\0a\0b\0c\0"),
+               BTF_STR_SEC("\0int\0s\0next\0a\0b\0c\0float\0d"),
        },
        .expect = {
                .raw_types = {
                        /* int */
-                       BTF_TYPE_INT_ENC(NAME_NTH(4), BTF_INT_SIGNED, 0, 32, 4),        /* [1] */
+                       BTF_TYPE_INT_ENC(NAME_NTH(5), BTF_INT_SIGNED, 0, 32, 4),        /* [1] */
                        /* int[16] */
                        BTF_TYPE_ARRAY_ENC(1, 1, 16),                                   /* [2] */
                        /* struct s { */
-                       BTF_STRUCT_ENC(NAME_NTH(6), 4, 84),                             /* [3] */
-                               BTF_MEMBER_ENC(NAME_NTH(5), 4, 0),      /* struct s *next;      */
+                       BTF_STRUCT_ENC(NAME_NTH(8), 5, 88),                             /* [3] */
+                               BTF_MEMBER_ENC(NAME_NTH(7), 4, 0),      /* struct s *next;      */
                                BTF_MEMBER_ENC(NAME_NTH(1), 5, 64),     /* const int *a;        */
                                BTF_MEMBER_ENC(NAME_NTH(2), 2, 128),    /* int b[16];           */
                                BTF_MEMBER_ENC(NAME_NTH(3), 1, 640),    /* int c;               */
+                               BTF_MEMBER_ENC(NAME_NTH(4), 7, 672),    /* float d;             */
                        /* ptr -> [3] struct s */
                        BTF_PTR_ENC(3),                                                 /* [4] */
                        /* ptr -> [6] const int */
                        BTF_PTR_ENC(6),                                                 /* [5] */
                        /* const -> [1] int */
                        BTF_CONST_ENC(1),                                               /* [6] */
+                       BTF_TYPE_FLOAT_ENC(NAME_NTH(7), 4),                             /* [7] */
                        BTF_END_RAW,
                },
-               BTF_STR_SEC("\0a\0b\0c\0int\0next\0s"),
+               BTF_STR_SEC("\0a\0b\0c\0d\0int\0float\0next\0s"),
        },
        .opts = {
                .dont_resolve_fwds = false,
@@ -6449,9 +6584,10 @@ const struct btf_dedup_test dedup_tests[] = {
                                BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
                                BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 8),
                        BTF_FUNC_ENC(NAME_TBD, 12),                                     /* [13] func */
+                       BTF_TYPE_FLOAT_ENC(NAME_TBD, 2),                                /* [14] float */
                        BTF_END_RAW,
                },
-               BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M"),
+               BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N"),
        },
        .expect = {
                .raw_types = {
@@ -6474,16 +6610,17 @@ const struct btf_dedup_test dedup_tests[] = {
                                BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
                                BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 8),
                        BTF_FUNC_ENC(NAME_TBD, 12),                                     /* [13] func */
+                       BTF_TYPE_FLOAT_ENC(NAME_TBD, 2),                                /* [14] float */
                        BTF_END_RAW,
                },
-               BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M"),
+               BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N"),
        },
        .opts = {
                .dont_resolve_fwds = false,
        },
 },
 {
-       .descr = "dedup: no int duplicates",
+       .descr = "dedup: no int/float duplicates",
        .input = {
                .raw_types = {
                        BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 8),
@@ -6498,9 +6635,15 @@ const struct btf_dedup_test dedup_tests[] = {
                        BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 27, 8),
                        /* different byte size */
                        BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4),
+                       /* all allowed sizes */
+                       BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 2),
+                       BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 4),
+                       BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 8),
+                       BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 12),
+                       BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 16),
                        BTF_END_RAW,
                },
-               BTF_STR_SEC("\0int\0some other int"),
+               BTF_STR_SEC("\0int\0some other int\0float"),
        },
        .expect = {
                .raw_types = {
@@ -6516,9 +6659,15 @@ const struct btf_dedup_test dedup_tests[] = {
                        BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 27, 8),
                        /* different byte size */
                        BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4),
+                       /* all allowed sizes */
+                       BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 2),
+                       BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 4),
+                       BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 8),
+                       BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 12),
+                       BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 16),
                        BTF_END_RAW,
                },
-               BTF_STR_SEC("\0int\0some other int"),
+               BTF_STR_SEC("\0int\0some other int\0float"),
        },
        .opts = {
                .dont_resolve_fwds = false,
@@ -6630,6 +6779,7 @@ static int btf_type_size(const struct btf_type *t)
        case BTF_KIND_PTR:
        case BTF_KIND_TYPEDEF:
        case BTF_KIND_FUNC:
+       case BTF_KIND_FLOAT:
                return base_size;
        case BTF_KIND_INT:
                return base_size + sizeof(__u32);
index 06eb956..d94dcea 100644 (file)
@@ -266,6 +266,7 @@ static int duration = 0;
                .arr_elem_sz = sizeof(((type *)0)->arr_field[0]),       \
                .ptr_sz = 8, /* always 8-byte pointer for BPF */        \
                .enum_sz = sizeof(((type *)0)->enum_field),             \
+               .float_sz = sizeof(((type *)0)->float_field),           \
        }
 
 #define SIZE_CASE(name) {                                              \
diff --git a/tools/testing/selftests/bpf/prog_tests/for_each.c b/tools/testing/selftests/bpf/prog_tests/for_each.c
new file mode 100644 (file)
index 0000000..68eb12a
--- /dev/null
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "for_each_hash_map_elem.skel.h"
+#include "for_each_array_map_elem.skel.h"
+
+static unsigned int duration;
+
+static void test_hash_map(void)
+{
+       int i, err, hashmap_fd, max_entries, percpu_map_fd;
+       struct for_each_hash_map_elem *skel;
+       __u64 *percpu_valbuf = NULL;
+       __u32 key, num_cpus, retval;
+       __u64 val;
+
+       skel = for_each_hash_map_elem__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "for_each_hash_map_elem__open_and_load"))
+               return;
+
+       hashmap_fd = bpf_map__fd(skel->maps.hashmap);
+       max_entries = bpf_map__max_entries(skel->maps.hashmap);
+       for (i = 0; i < max_entries; i++) {
+               key = i;
+               val = i + 1;
+               err = bpf_map_update_elem(hashmap_fd, &key, &val, BPF_ANY);
+               if (!ASSERT_OK(err, "map_update"))
+                       goto out;
+       }
+
+       num_cpus = bpf_num_possible_cpus();
+       percpu_map_fd = bpf_map__fd(skel->maps.percpu_map);
+       percpu_valbuf = malloc(sizeof(__u64) * num_cpus);
+       if (!ASSERT_OK_PTR(percpu_valbuf, "percpu_valbuf"))
+               goto out;
+
+       key = 1;
+       for (i = 0; i < num_cpus; i++)
+               percpu_valbuf[i] = i + 1;
+       err = bpf_map_update_elem(percpu_map_fd, &key, percpu_valbuf, BPF_ANY);
+       if (!ASSERT_OK(err, "percpu_map_update"))
+               goto out;
+
+       err = bpf_prog_test_run(bpf_program__fd(skel->progs.test_pkt_access),
+                               1, &pkt_v4, sizeof(pkt_v4), NULL, NULL,
+                               &retval, &duration);
+       if (CHECK(err || retval, "ipv4", "err %d errno %d retval %d\n",
+                 err, errno, retval))
+               goto out;
+
+       ASSERT_EQ(skel->bss->hashmap_output, 4, "hashmap_output");
+       ASSERT_EQ(skel->bss->hashmap_elems, max_entries, "hashmap_elems");
+
+       key = 1;
+       err = bpf_map_lookup_elem(hashmap_fd, &key, &val);
+       ASSERT_ERR(err, "hashmap_lookup");
+
+       ASSERT_EQ(skel->bss->percpu_called, 1, "percpu_called");
+       ASSERT_LT(skel->bss->cpu, num_cpus, "num_cpus");
+       ASSERT_EQ(skel->bss->percpu_map_elems, 1, "percpu_map_elems");
+       ASSERT_EQ(skel->bss->percpu_key, 1, "percpu_key");
+       ASSERT_EQ(skel->bss->percpu_val, skel->bss->cpu + 1, "percpu_val");
+       ASSERT_EQ(skel->bss->percpu_output, 100, "percpu_output");
+out:
+       free(percpu_valbuf);
+       for_each_hash_map_elem__destroy(skel);
+}
+
+static void test_array_map(void)
+{
+       __u32 key, num_cpus, max_entries, retval;
+       int i, arraymap_fd, percpu_map_fd, err;
+       struct for_each_array_map_elem *skel;
+       __u64 *percpu_valbuf = NULL;
+       __u64 val, expected_total;
+
+       skel = for_each_array_map_elem__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "for_each_array_map_elem__open_and_load"))
+               return;
+
+       arraymap_fd = bpf_map__fd(skel->maps.arraymap);
+       expected_total = 0;
+       max_entries = bpf_map__max_entries(skel->maps.arraymap);
+       for (i = 0; i < max_entries; i++) {
+               key = i;
+               val = i + 1;
+               /* skip the last iteration for expected total */
+               if (i != max_entries - 1)
+                       expected_total += val;
+               err = bpf_map_update_elem(arraymap_fd, &key, &val, BPF_ANY);
+               if (!ASSERT_OK(err, "map_update"))
+                       goto out;
+       }
+
+       num_cpus = bpf_num_possible_cpus();
+       percpu_map_fd = bpf_map__fd(skel->maps.percpu_map);
+       percpu_valbuf = malloc(sizeof(__u64) * num_cpus);
+       if (!ASSERT_OK_PTR(percpu_valbuf, "percpu_valbuf"))
+               goto out;
+
+       key = 0;
+       for (i = 0; i < num_cpus; i++)
+               percpu_valbuf[i] = i + 1;
+       err = bpf_map_update_elem(percpu_map_fd, &key, percpu_valbuf, BPF_ANY);
+       if (!ASSERT_OK(err, "percpu_map_update"))
+               goto out;
+
+       err = bpf_prog_test_run(bpf_program__fd(skel->progs.test_pkt_access),
+                               1, &pkt_v4, sizeof(pkt_v4), NULL, NULL,
+                               &retval, &duration);
+       if (CHECK(err || retval, "ipv4", "err %d errno %d retval %d\n",
+                 err, errno, retval))
+               goto out;
+
+       ASSERT_EQ(skel->bss->arraymap_output, expected_total, "array_output");
+       ASSERT_EQ(skel->bss->cpu + 1, skel->bss->percpu_val, "percpu_val");
+
+out:
+       free(percpu_valbuf);
+       for_each_array_map_elem__destroy(skel);
+}
+
+void test_for_each(void)
+{
+       if (test__start_subtest("hash_map"))
+               test_hash_map();
+       if (test__start_subtest("array_map"))
+               test_array_map();
+}
index 935a294..131d7f7 100644 (file)
@@ -2,12 +2,31 @@
 #include <test_progs.h>
 #include <network_helpers.h>
 
-void test_prog_run_xattr(void)
+#include "test_pkt_access.skel.h"
+
+static const __u32 duration;
+
+static void check_run_cnt(int prog_fd, __u64 run_cnt)
 {
-       const char *file = "./test_pkt_access.o";
-       struct bpf_object *obj;
-       char buf[10];
+       struct bpf_prog_info info = {};
+       __u32 info_len = sizeof(info);
        int err;
+
+       err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+       if (CHECK(err, "get_prog_info", "failed to get bpf_prog_info for fd %d\n", prog_fd))
+               return;
+
+       CHECK(run_cnt != info.run_cnt, "run_cnt",
+             "incorrect number of repetitions, want %llu have %llu\n", run_cnt, info.run_cnt);
+}
+
+void test_prog_run_xattr(void)
+{
+       struct test_pkt_access *skel;
+       int err, stats_fd = -1;
+       char buf[10] = {};
+       __u64 run_cnt = 0;
+
        struct bpf_prog_test_run_attr tattr = {
                .repeat = 1,
                .data_in = &pkt_v4,
@@ -16,12 +35,15 @@ void test_prog_run_xattr(void)
                .data_size_out = 5,
        };
 
-       err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj,
-                           &tattr.prog_fd);
-       if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno))
+       stats_fd = bpf_enable_stats(BPF_STATS_RUN_TIME);
+       if (CHECK_ATTR(stats_fd < 0, "enable_stats", "failed %d\n", errno))
                return;
 
-       memset(buf, 0, sizeof(buf));
+       skel = test_pkt_access__open_and_load();
+       if (CHECK_ATTR(!skel, "open_and_load", "failed\n"))
+               goto cleanup;
+
+       tattr.prog_fd = bpf_program__fd(skel->progs.test_pkt_access);
 
        err = bpf_prog_test_run_xattr(&tattr);
        CHECK_ATTR(err != -1 || errno != ENOSPC || tattr.retval, "run",
@@ -34,8 +56,12 @@ void test_prog_run_xattr(void)
        CHECK_ATTR(buf[5] != 0, "overflow",
              "BPF_PROG_TEST_RUN ignored size hint\n");
 
+       run_cnt += tattr.repeat;
+       check_run_cnt(tattr.prog_fd, run_cnt);
+
        tattr.data_out = NULL;
        tattr.data_size_out = 0;
+       tattr.repeat = 2;
        errno = 0;
 
        err = bpf_prog_test_run_xattr(&tattr);
@@ -46,5 +72,12 @@ void test_prog_run_xattr(void)
        err = bpf_prog_test_run_xattr(&tattr);
        CHECK_ATTR(err != -EINVAL, "run_wrong_size_out", "err %d\n", err);
 
-       bpf_object__close(obj);
+       run_cnt += tattr.repeat;
+       check_run_cnt(tattr.prog_fd, run_cnt);
+
+cleanup:
+       if (skel)
+               test_pkt_access__destroy(skel);
+       if (stats_fd != -1)
+               close(stats_fd);
 }
index 9ff0412..45c82db 100644 (file)
@@ -241,6 +241,48 @@ fail:
        return -1;
 }
 
+static __u64 socket_cookie(int fd)
+{
+       __u64 cookie;
+       socklen_t cookie_len = sizeof(cookie);
+
+       if (CHECK(getsockopt(fd, SOL_SOCKET, SO_COOKIE, &cookie, &cookie_len) < 0,
+                 "getsockopt(SO_COOKIE)", "%s\n", strerror(errno)))
+               return 0;
+       return cookie;
+}
+
+static int fill_sk_lookup_ctx(struct bpf_sk_lookup *ctx, const char *local_ip, __u16 local_port,
+                             const char *remote_ip, __u16 remote_port)
+{
+       void *local, *remote;
+       int err;
+
+       memset(ctx, 0, sizeof(*ctx));
+       ctx->local_port = local_port;
+       ctx->remote_port = htons(remote_port);
+
+       if (is_ipv6(local_ip)) {
+               ctx->family = AF_INET6;
+               local = &ctx->local_ip6[0];
+               remote = &ctx->remote_ip6[0];
+       } else {
+               ctx->family = AF_INET;
+               local = &ctx->local_ip4;
+               remote = &ctx->remote_ip4;
+       }
+
+       err = inet_pton(ctx->family, local_ip, local);
+       if (CHECK(err != 1, "inet_pton", "local_ip failed\n"))
+               return 1;
+
+       err = inet_pton(ctx->family, remote_ip, remote);
+       if (CHECK(err != 1, "inet_pton", "remote_ip failed\n"))
+               return 1;
+
+       return 0;
+}
+
 static int send_byte(int fd)
 {
        ssize_t n;
@@ -1009,18 +1051,27 @@ static void test_drop_on_reuseport(struct test_sk_lookup *skel)
 
 static void run_sk_assign(struct test_sk_lookup *skel,
                          struct bpf_program *lookup_prog,
-                         const char *listen_ip, const char *connect_ip)
+                         const char *remote_ip, const char *local_ip)
 {
-       int client_fd, peer_fd, server_fds[MAX_SERVERS] = { -1 };
-       struct bpf_link *lookup_link;
+       int server_fds[MAX_SERVERS] = { -1 };
+       struct bpf_sk_lookup ctx;
+       __u64 server_cookie;
        int i, err;
 
-       lookup_link = attach_lookup_prog(lookup_prog);
-       if (!lookup_link)
+       DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
+               .ctx_in = &ctx,
+               .ctx_size_in = sizeof(ctx),
+               .ctx_out = &ctx,
+               .ctx_size_out = sizeof(ctx),
+       );
+
+       if (fill_sk_lookup_ctx(&ctx, local_ip, EXT_PORT, remote_ip, INT_PORT))
                return;
 
+       ctx.protocol = IPPROTO_TCP;
+
        for (i = 0; i < ARRAY_SIZE(server_fds); i++) {
-               server_fds[i] = make_server(SOCK_STREAM, listen_ip, 0, NULL);
+               server_fds[i] = make_server(SOCK_STREAM, local_ip, 0, NULL);
                if (server_fds[i] < 0)
                        goto close_servers;
 
@@ -1030,23 +1081,25 @@ static void run_sk_assign(struct test_sk_lookup *skel,
                        goto close_servers;
        }
 
-       client_fd = make_client(SOCK_STREAM, connect_ip, EXT_PORT);
-       if (client_fd < 0)
+       server_cookie = socket_cookie(server_fds[SERVER_B]);
+       if (!server_cookie)
+               return;
+               goto close_servers;
+       err = bpf_prog_test_run_opts(bpf_program__fd(lookup_prog), &opts);
+       if (CHECK(err, "test_run", "failed with error %d\n", errno))
+               goto close_servers;
+
+       if (CHECK(ctx.cookie == 0, "ctx.cookie", "no socket selected\n"))
                goto close_servers;
 
-       peer_fd = accept(server_fds[SERVER_B], NULL, NULL);
-       if (CHECK(peer_fd < 0, "accept", "failed\n"))
-               goto close_client;
+       CHECK(ctx.cookie != server_cookie, "ctx.cookie",
+             "selected sk %llu instead of %llu\n", ctx.cookie, server_cookie);
 
-       close(peer_fd);
-close_client:
-       close(client_fd);
 close_servers:
        for (i = 0; i < ARRAY_SIZE(server_fds); i++) {
                if (server_fds[i] != -1)
                        close(server_fds[i]);
        }
-       bpf_link__destroy(lookup_link);
 }
 
 static void run_sk_assign_v4(struct test_sk_lookup *skel,
index d7d65a7..c26e6bf 100644 (file)
@@ -1014,8 +1014,8 @@ static void test_skb_redir_to_connected(struct test_sockmap_listen *skel,
                                        struct bpf_map *inner_map, int family,
                                        int sotype)
 {
-       int verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
-       int parser = bpf_program__fd(skel->progs.prog_skb_parser);
+       int verdict = bpf_program__fd(skel->progs.prog_stream_verdict);
+       int parser = bpf_program__fd(skel->progs.prog_stream_parser);
        int verdict_map = bpf_map__fd(skel->maps.verdict_map);
        int sock_map = bpf_map__fd(inner_map);
        int err;
@@ -1125,8 +1125,8 @@ static void test_skb_redir_to_listening(struct test_sockmap_listen *skel,
                                        struct bpf_map *inner_map, int family,
                                        int sotype)
 {
-       int verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
-       int parser = bpf_program__fd(skel->progs.prog_skb_parser);
+       int verdict = bpf_program__fd(skel->progs.prog_stream_verdict);
+       int parser = bpf_program__fd(skel->progs.prog_stream_parser);
        int verdict_map = bpf_map__fd(skel->maps.verdict_map);
        int sock_map = bpf_map__fd(inner_map);
        int err;
diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
new file mode 100644 (file)
index 0000000..035c263
--- /dev/null
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#define _GNU_SOURCE         /* See feature_test_macros(7) */
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sys/types.h>
+#include <test_progs.h>
+#include "task_local_storage.skel.h"
+#include "task_local_storage_exit_creds.skel.h"
+#include "task_ls_recursion.skel.h"
+
+static void test_sys_enter_exit(void)
+{
+       struct task_local_storage *skel;
+       int err;
+
+       skel = task_local_storage__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+               return;
+
+       skel->bss->target_pid = syscall(SYS_gettid);
+
+       err = task_local_storage__attach(skel);
+       if (!ASSERT_OK(err, "skel_attach"))
+               goto out;
+
+       syscall(SYS_gettid);
+       syscall(SYS_gettid);
+
+       /* 3x syscalls: 1x attach and 2x gettid */
+       ASSERT_EQ(skel->bss->enter_cnt, 3, "enter_cnt");
+       ASSERT_EQ(skel->bss->exit_cnt, 3, "exit_cnt");
+       ASSERT_EQ(skel->bss->mismatch_cnt, 0, "mismatch_cnt");
+out:
+       task_local_storage__destroy(skel);
+}
+
+static void test_exit_creds(void)
+{
+       struct task_local_storage_exit_creds *skel;
+       int err;
+
+       skel = task_local_storage_exit_creds__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+               return;
+
+       err = task_local_storage_exit_creds__attach(skel);
+       if (!ASSERT_OK(err, "skel_attach"))
+               goto out;
+
+       /* trigger at least one exit_creds() */
+       if (CHECK_FAIL(system("ls > /dev/null")))
+               goto out;
+
+       /* sync rcu to make sure exit_creds() is called for "ls" */
+       kern_sync_rcu();
+       ASSERT_EQ(skel->bss->valid_ptr_count, 0, "valid_ptr_count");
+       ASSERT_NEQ(skel->bss->null_ptr_count, 0, "null_ptr_count");
+out:
+       task_local_storage_exit_creds__destroy(skel);
+}
+
+static void test_recursion(void)
+{
+       struct task_ls_recursion *skel;
+       int err;
+
+       skel = task_ls_recursion__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+               return;
+
+       err = task_ls_recursion__attach(skel);
+       if (!ASSERT_OK(err, "skel_attach"))
+               goto out;
+
+       /* trigger sys_enter, make sure it does not cause deadlock */
+       syscall(SYS_gettid);
+
+out:
+       task_ls_recursion__destroy(skel);
+}
+
+void test_task_local_storage(void)
+{
+       if (test__start_subtest("sys_enter_exit"))
+               test_sys_enter_exit();
+       if (test__start_subtest("exit_creds"))
+               test_exit_creds();
+       if (test__start_subtest("recursion"))
+               test_recursion();
+}
index 31975c9..12b40dc 100644 (file)
@@ -205,6 +205,12 @@ struct struct_with_embedded_stuff {
        int t[11];
 };
 
+struct float_struct {
+       float f;
+       const double *d;
+       volatile long double *ld;
+};
+
 struct root_struct {
        enum e1 _1;
        enum e2 _2;
@@ -219,6 +225,7 @@ struct root_struct {
        union_fwd_t *_12;
        union_fwd_ptr_t _13;
        struct struct_with_embedded_stuff _14;
+       struct float_struct _15;
 };
 
 /* ------ END-EXPECTED-OUTPUT ------ */
index 9a28508..9982eb9 100644 (file)
@@ -807,6 +807,7 @@ struct core_reloc_size_output {
        int arr_elem_sz;
        int ptr_sz;
        int enum_sz;
+       int float_sz;
 };
 
 struct core_reloc_size {
@@ -816,6 +817,7 @@ struct core_reloc_size {
        int arr_field[4];
        void *ptr_field;
        enum { VALUE = 123 } enum_field;
+       float float_field;
 };
 
 struct core_reloc_size___diff_sz {
@@ -825,6 +827,7 @@ struct core_reloc_size___diff_sz {
        char arr_field[10];
        void *ptr_field;
        enum { OTHER_VALUE = 0xFFFFFFFFFFFFFFFF } enum_field;
+       double float_field;
 };
 
 /* Error case of two candidates with the fields (int_field) at the same
@@ -839,6 +842,7 @@ struct core_reloc_size___err_ambiguous1 {
        int arr_field[4];
        void *ptr_field;
        enum { VALUE___1 = 123 } enum_field;
+       float float_field;
 };
 
 struct core_reloc_size___err_ambiguous2 {
@@ -850,6 +854,7 @@ struct core_reloc_size___err_ambiguous2 {
        int arr_field[4];
        void *ptr_field;
        enum { VALUE___2 = 123 } enum_field;
+       float float_field;
 };
 
 /*
diff --git a/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c b/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c
new file mode 100644 (file)
index 0000000..75e8e10
--- /dev/null
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __uint(max_entries, 3);
+       __type(key, __u32);
+       __type(value, __u64);
+} arraymap SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+       __uint(max_entries, 1);
+       __type(key, __u32);
+       __type(value, __u64);
+} percpu_map SEC(".maps");
+
+struct callback_ctx {
+       int output;
+};
+
+static __u64
+check_array_elem(struct bpf_map *map, __u32 *key, __u64 *val,
+                struct callback_ctx *data)
+{
+       data->output += *val;
+       if (*key == 1)
+               return 1; /* stop the iteration */
+       return 0;
+}
+
+__u32 cpu = 0;
+__u64 percpu_val = 0;
+
+static __u64
+check_percpu_elem(struct bpf_map *map, __u32 *key, __u64 *val,
+                 struct callback_ctx *data)
+{
+       cpu = bpf_get_smp_processor_id();
+       percpu_val = *val;
+       return 0;
+}
+
+u32 arraymap_output = 0;
+
+SEC("classifier")
+int test_pkt_access(struct __sk_buff *skb)
+{
+       struct callback_ctx data;
+
+       data.output = 0;
+       bpf_for_each_map_elem(&arraymap, check_array_elem, &data, 0);
+       arraymap_output = data.output;
+
+       bpf_for_each_map_elem(&percpu_map, check_percpu_elem, (void *)0, 0);
+       return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/for_each_hash_map_elem.c b/tools/testing/selftests/bpf/progs/for_each_hash_map_elem.c
new file mode 100644 (file)
index 0000000..913dd91
--- /dev/null
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __uint(max_entries, 3);
+       __type(key, __u32);
+       __type(value, __u64);
+} hashmap SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+       __uint(max_entries, 1);
+       __type(key, __u32);
+       __type(value, __u64);
+} percpu_map SEC(".maps");
+
+struct callback_ctx {
+       struct __sk_buff *ctx;
+       int input;
+       int output;
+};
+
+static __u64
+check_hash_elem(struct bpf_map *map, __u32 *key, __u64 *val,
+               struct callback_ctx *data)
+{
+       struct __sk_buff *skb = data->ctx;
+       __u32 k;
+       __u64 v;
+
+       if (skb) {
+               k = *key;
+               v = *val;
+               if (skb->len == 10000 && k == 10 && v == 10)
+                       data->output = 3; /* impossible path */
+               else
+                       data->output = 4;
+       } else {
+               data->output = data->input;
+               bpf_map_delete_elem(map, key);
+       }
+
+       return 0;
+}
+
+__u32 cpu = 0;
+__u32 percpu_called = 0;
+__u32 percpu_key = 0;
+__u64 percpu_val = 0;
+int percpu_output = 0;
+
+static __u64
+check_percpu_elem(struct bpf_map *map, __u32 *key, __u64 *val,
+                 struct callback_ctx *unused)
+{
+       struct callback_ctx data;
+
+       percpu_called++;
+       cpu = bpf_get_smp_processor_id();
+       percpu_key = *key;
+       percpu_val = *val;
+
+       data.ctx = 0;
+       data.input = 100;
+       data.output = 0;
+       bpf_for_each_map_elem(&hashmap, check_hash_elem, &data, 0);
+       percpu_output = data.output;
+
+       return 0;
+}
+
+int hashmap_output = 0;
+int hashmap_elems = 0;
+int percpu_map_elems = 0;
+
+SEC("classifier")
+int test_pkt_access(struct __sk_buff *skb)
+{
+       struct callback_ctx data;
+
+       data.ctx = skb;
+       data.input = 10;
+       data.output = 0;
+       hashmap_elems = bpf_for_each_map_elem(&hashmap, check_hash_elem, &data, 0);
+       hashmap_output = data.output;
+
+       percpu_map_elems = bpf_for_each_map_elem(&percpu_map, check_percpu_elem,
+                                                (void *)0, 0);
+       return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/loop6.c b/tools/testing/selftests/bpf/progs/loop6.c
new file mode 100644 (file)
index 0000000..38de033
--- /dev/null
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ptrace.h>
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* typically virtio scsi has max SGs of 6 */
+#define VIRTIO_MAX_SGS 6
+
+/* The verifier will fail with SG_MAX = 128. The failure can be
+ * worked around with a smaller SG_MAX, e.g. 10.
+ */
+#define WORKAROUND
+#ifdef WORKAROUND
+#define SG_MAX         10
+#else
+/* typically virtio blk has max SEG of 128 */
+#define SG_MAX         128
+#endif
+
+#define SG_CHAIN       0x01UL
+#define SG_END         0x02UL
+
+struct scatterlist {
+       unsigned long   page_link;
+       unsigned int    offset;
+       unsigned int    length;
+};
+
+#define sg_is_chain(sg)                ((sg)->page_link & SG_CHAIN)
+#define sg_is_last(sg)         ((sg)->page_link & SG_END)
+#define sg_chain_ptr(sg)       \
+       ((struct scatterlist *) ((sg)->page_link & ~(SG_CHAIN | SG_END)))
+
+static inline struct scatterlist *__sg_next(struct scatterlist *sgp)
+{
+       struct scatterlist sg;
+
+       bpf_probe_read_kernel(&sg, sizeof(sg), sgp);
+       if (sg_is_last(&sg))
+               return NULL;
+
+       sgp++;
+
+       bpf_probe_read_kernel(&sg, sizeof(sg), sgp);
+       if (sg_is_chain(&sg))
+               sgp = sg_chain_ptr(&sg);
+
+       return sgp;
+}
+
+static inline struct scatterlist *get_sgp(struct scatterlist **sgs, int i)
+{
+       struct scatterlist *sgp;
+
+       bpf_probe_read_kernel(&sgp, sizeof(sgp), sgs + i);
+       return sgp;
+}
+
+int config = 0;
+int result = 0;
+
+SEC("kprobe/virtqueue_add_sgs")
+int BPF_KPROBE(trace_virtqueue_add_sgs, void *unused, struct scatterlist **sgs,
+              unsigned int out_sgs, unsigned int in_sgs)
+{
+       struct scatterlist *sgp = NULL;
+       __u64 length1 = 0, length2 = 0;
+       unsigned int i, n, len;
+
+       if (config != 0)
+               return 0;
+
+       for (i = 0; (i < VIRTIO_MAX_SGS) && (i < out_sgs); i++) {
+               for (n = 0, sgp = get_sgp(sgs, i); sgp && (n < SG_MAX);
+                    sgp = __sg_next(sgp)) {
+                       bpf_probe_read_kernel(&len, sizeof(len), &sgp->length);
+                       length1 += len;
+                       n++;
+               }
+       }
+
+       for (i = 0; (i < VIRTIO_MAX_SGS) && (i < in_sgs); i++) {
+               for (n = 0, sgp = get_sgp(sgs, i); sgp && (n < SG_MAX);
+                    sgp = __sg_next(sgp)) {
+                       bpf_probe_read_kernel(&len, sizeof(len), &sgp->length);
+                       length2 += len;
+                       n++;
+               }
+       }
+
+       config = 1;
+       result = length2 - length1;
+       return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/task_local_storage.c b/tools/testing/selftests/bpf/progs/task_local_storage.c
new file mode 100644 (file)
index 0000000..80a0a20
--- /dev/null
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+       __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+       __uint(map_flags, BPF_F_NO_PREALLOC);
+       __type(key, int);
+       __type(value, long);
+} enter_id SEC(".maps");
+
+#define MAGIC_VALUE 0xabcd1234
+
+pid_t target_pid = 0;
+int mismatch_cnt = 0;
+int enter_cnt = 0;
+int exit_cnt = 0;
+
+SEC("tp_btf/sys_enter")
+int BPF_PROG(on_enter, struct pt_regs *regs, long id)
+{
+       struct task_struct *task;
+       long *ptr;
+
+       task = bpf_get_current_task_btf();
+       if (task->pid != target_pid)
+               return 0;
+
+       ptr = bpf_task_storage_get(&enter_id, task, 0,
+                                  BPF_LOCAL_STORAGE_GET_F_CREATE);
+       if (!ptr)
+               return 0;
+
+       __sync_fetch_and_add(&enter_cnt, 1);
+       *ptr = MAGIC_VALUE + enter_cnt;
+
+       return 0;
+}
+
+SEC("tp_btf/sys_exit")
+int BPF_PROG(on_exit, struct pt_regs *regs, long id)
+{
+       struct task_struct *task;
+       long *ptr;
+
+       task = bpf_get_current_task_btf();
+       if (task->pid != target_pid)
+               return 0;
+
+       ptr = bpf_task_storage_get(&enter_id, task, 0,
+                                  BPF_LOCAL_STORAGE_GET_F_CREATE);
+       if (!ptr)
+               return 0;
+
+       __sync_fetch_and_add(&exit_cnt, 1);
+       if (*ptr != MAGIC_VALUE + exit_cnt)
+               __sync_fetch_and_add(&mismatch_cnt, 1);
+       return 0;
+}
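
User space drives this test by setting target_pid and reading the counters back after a few syscalls. A minimal sketch of that side, assuming a libbpf skeleton generated from the program above (the skeleton header and its generated names follow the usual "bpftool gen skeleton" convention):

    /* Hypothetical driver, assuming task_local_storage.skel.h was produced
     * by "bpftool gen skeleton" from the BPF object above.
     */
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include "task_local_storage.skel.h"

    int main(void)
    {
            struct task_local_storage *skel;

            skel = task_local_storage__open_and_load();
            if (!skel)
                    return 1;

            skel->bss->target_pid = getpid();       /* only trace this process */
            if (task_local_storage__attach(skel))
                    goto out;

            syscall(SYS_gettid);    /* trigger a sys_enter/sys_exit pair */

            /* enter_cnt and exit_cnt should match; mismatch_cnt should be 0 */
            printf("enter=%d exit=%d mismatch=%d\n", skel->bss->enter_cnt,
                   skel->bss->exit_cnt, skel->bss->mismatch_cnt);
    out:
            task_local_storage__destroy(skel);
            return 0;
    }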
diff --git a/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c b/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c
new file mode 100644 (file)
index 0000000..81758c0
--- /dev/null
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+       __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+       __uint(map_flags, BPF_F_NO_PREALLOC);
+       __type(key, int);
+       __type(value, __u64);
+} task_storage SEC(".maps");
+
+int valid_ptr_count = 0;
+int null_ptr_count = 0;
+
+SEC("fentry/exit_creds")
+int BPF_PROG(trace_exit_creds, struct task_struct *task)
+{
+       __u64 *ptr;
+
+       ptr = bpf_task_storage_get(&task_storage, task, 0,
+                                  BPF_LOCAL_STORAGE_GET_F_CREATE);
+       if (ptr)
+               __sync_fetch_and_add(&valid_ptr_count, 1);
+       else
+               __sync_fetch_and_add(&null_ptr_count, 1);
+       return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/task_ls_recursion.c b/tools/testing/selftests/bpf/progs/task_ls_recursion.c
new file mode 100644 (file)
index 0000000..564583d
--- /dev/null
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+       __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+       __uint(map_flags, BPF_F_NO_PREALLOC);
+       __type(key, int);
+       __type(value, long);
+} map_a SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+       __uint(map_flags, BPF_F_NO_PREALLOC);
+       __type(key, int);
+       __type(value, long);
+} map_b SEC(".maps");
+
+SEC("fentry/bpf_local_storage_lookup")
+int BPF_PROG(on_lookup)
+{
+       struct task_struct *task = bpf_get_current_task_btf();
+
+       bpf_task_storage_delete(&map_a, task);
+       bpf_task_storage_delete(&map_b, task);
+       return 0;
+}
+
+SEC("fentry/bpf_local_storage_update")
+int BPF_PROG(on_update)
+{
+       struct task_struct *task = bpf_get_current_task_btf();
+       long *ptr;
+
+       ptr = bpf_task_storage_get(&map_a, task, 0,
+                                  BPF_LOCAL_STORAGE_GET_F_CREATE);
+       if (ptr)
+               *ptr += 1;
+
+       ptr = bpf_task_storage_get(&map_b, task, 0,
+                                  BPF_LOCAL_STORAGE_GET_F_CREATE);
+       if (ptr)
+               *ptr += 1;
+
+       return 0;
+}
+
+SEC("tp_btf/sys_enter")
+int BPF_PROG(on_enter, struct pt_regs *regs, long id)
+{
+       struct task_struct *task;
+       long *ptr;
+
+       task = bpf_get_current_task_btf();
+       ptr = bpf_task_storage_get(&map_a, task, 0,
+                                  BPF_LOCAL_STORAGE_GET_F_CREATE);
+       if (ptr)
+               *ptr = 200;
+
+       ptr = bpf_task_storage_get(&map_b, task, 0,
+                                  BPF_LOCAL_STORAGE_GET_F_CREATE);
+       if (ptr)
+               *ptr = 100;
+       return 0;
+}
index d7fb6cf..7b2d576 100644 (file)
@@ -21,6 +21,7 @@ struct core_reloc_size_output {
        int arr_elem_sz;
        int ptr_sz;
        int enum_sz;
+       int float_sz;
 };
 
 struct core_reloc_size {
@@ -30,6 +31,7 @@ struct core_reloc_size {
        int arr_field[4];
        void *ptr_field;
        enum { VALUE = 123 } enum_field;
+       float float_field;
 };
 
 SEC("raw_tracepoint/sys_enter")
@@ -45,6 +47,7 @@ int test_core_size(void *ctx)
        out->arr_elem_sz = bpf_core_field_size(in->arr_field[0]);
        out->ptr_sz = bpf_core_field_size(in->ptr_field);
        out->enum_sz = bpf_core_field_size(in->enum_field);
+       out->float_sz = bpf_core_field_size(in->float_field);
 
        return 0;
 }
index 1032b29..ac6f7f2 100644 (file)
@@ -64,6 +64,10 @@ static const int PROG_DONE = 1;
 static const __u32 KEY_SERVER_A = SERVER_A;
 static const __u32 KEY_SERVER_B = SERVER_B;
 
+static const __u16 SRC_PORT = bpf_htons(8008);
+static const __u32 SRC_IP4 = IP4(127, 0, 0, 2);
+static const __u32 SRC_IP6[] = IP6(0xfd000000, 0x0, 0x0, 0x00000002);
+
 static const __u16 DST_PORT = 7007; /* Host byte order */
 static const __u32 DST_IP4 = IP4(127, 0, 0, 1);
 static const __u32 DST_IP6[] = IP6(0xfd000000, 0x0, 0x0, 0x00000001);
@@ -398,11 +402,12 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx)
        if (LSW(ctx->protocol, 0) != IPPROTO_TCP)
                return SK_DROP;
 
-       /* Narrow loads from remote_port field. Expect non-0 value. */
-       if (LSB(ctx->remote_port, 0) == 0 && LSB(ctx->remote_port, 1) == 0 &&
-           LSB(ctx->remote_port, 2) == 0 && LSB(ctx->remote_port, 3) == 0)
+       /* Narrow loads from remote_port field. Expect SRC_PORT. */
+       if (LSB(ctx->remote_port, 0) != ((SRC_PORT >> 0) & 0xff) ||
+           LSB(ctx->remote_port, 1) != ((SRC_PORT >> 8) & 0xff) ||
+           LSB(ctx->remote_port, 2) != 0 || LSB(ctx->remote_port, 3) != 0)
                return SK_DROP;
-       if (LSW(ctx->remote_port, 0) == 0)
+       if (LSW(ctx->remote_port, 0) != SRC_PORT)
                return SK_DROP;
 
        /* Narrow loads from local_port field. Expect DST_PORT. */
@@ -415,11 +420,14 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx)
 
        /* Narrow loads from IPv4 fields */
        if (v4) {
-               /* Expect non-0.0.0.0 in remote_ip4 */
-               if (LSB(ctx->remote_ip4, 0) == 0 && LSB(ctx->remote_ip4, 1) == 0 &&
-                   LSB(ctx->remote_ip4, 2) == 0 && LSB(ctx->remote_ip4, 3) == 0)
+               /* Expect SRC_IP4 in remote_ip4 */
+               if (LSB(ctx->remote_ip4, 0) != ((SRC_IP4 >> 0) & 0xff) ||
+                   LSB(ctx->remote_ip4, 1) != ((SRC_IP4 >> 8) & 0xff) ||
+                   LSB(ctx->remote_ip4, 2) != ((SRC_IP4 >> 16) & 0xff) ||
+                   LSB(ctx->remote_ip4, 3) != ((SRC_IP4 >> 24) & 0xff))
                        return SK_DROP;
-               if (LSW(ctx->remote_ip4, 0) == 0 && LSW(ctx->remote_ip4, 1) == 0)
+               if (LSW(ctx->remote_ip4, 0) != ((SRC_IP4 >> 0) & 0xffff) ||
+                   LSW(ctx->remote_ip4, 1) != ((SRC_IP4 >> 16) & 0xffff))
                        return SK_DROP;
 
                /* Expect DST_IP4 in local_ip4 */
@@ -448,20 +456,32 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx)
 
        /* Narrow loads from IPv6 fields */
        if (!v4) {
-               /* Expect non-:: IP in remote_ip6 */
-               if (LSB(ctx->remote_ip6[0], 0) == 0 && LSB(ctx->remote_ip6[0], 1) == 0 &&
-                   LSB(ctx->remote_ip6[0], 2) == 0 && LSB(ctx->remote_ip6[0], 3) == 0 &&
-                   LSB(ctx->remote_ip6[1], 0) == 0 && LSB(ctx->remote_ip6[1], 1) == 0 &&
-                   LSB(ctx->remote_ip6[1], 2) == 0 && LSB(ctx->remote_ip6[1], 3) == 0 &&
-                   LSB(ctx->remote_ip6[2], 0) == 0 && LSB(ctx->remote_ip6[2], 1) == 0 &&
-                   LSB(ctx->remote_ip6[2], 2) == 0 && LSB(ctx->remote_ip6[2], 3) == 0 &&
-                   LSB(ctx->remote_ip6[3], 0) == 0 && LSB(ctx->remote_ip6[3], 1) == 0 &&
-                   LSB(ctx->remote_ip6[3], 2) == 0 && LSB(ctx->remote_ip6[3], 3) == 0)
+               /* Expect SRC_IP6 in remote_ip6 */
+               if (LSB(ctx->remote_ip6[0], 0) != ((SRC_IP6[0] >> 0) & 0xff) ||
+                   LSB(ctx->remote_ip6[0], 1) != ((SRC_IP6[0] >> 8) & 0xff) ||
+                   LSB(ctx->remote_ip6[0], 2) != ((SRC_IP6[0] >> 16) & 0xff) ||
+                   LSB(ctx->remote_ip6[0], 3) != ((SRC_IP6[0] >> 24) & 0xff) ||
+                   LSB(ctx->remote_ip6[1], 0) != ((SRC_IP6[1] >> 0) & 0xff) ||
+                   LSB(ctx->remote_ip6[1], 1) != ((SRC_IP6[1] >> 8) & 0xff) ||
+                   LSB(ctx->remote_ip6[1], 2) != ((SRC_IP6[1] >> 16) & 0xff) ||
+                   LSB(ctx->remote_ip6[1], 3) != ((SRC_IP6[1] >> 24) & 0xff) ||
+                   LSB(ctx->remote_ip6[2], 0) != ((SRC_IP6[2] >> 0) & 0xff) ||
+                   LSB(ctx->remote_ip6[2], 1) != ((SRC_IP6[2] >> 8) & 0xff) ||
+                   LSB(ctx->remote_ip6[2], 2) != ((SRC_IP6[2] >> 16) & 0xff) ||
+                   LSB(ctx->remote_ip6[2], 3) != ((SRC_IP6[2] >> 24) & 0xff) ||
+                   LSB(ctx->remote_ip6[3], 0) != ((SRC_IP6[3] >> 0) & 0xff) ||
+                   LSB(ctx->remote_ip6[3], 1) != ((SRC_IP6[3] >> 8) & 0xff) ||
+                   LSB(ctx->remote_ip6[3], 2) != ((SRC_IP6[3] >> 16) & 0xff) ||
+                   LSB(ctx->remote_ip6[3], 3) != ((SRC_IP6[3] >> 24) & 0xff))
                        return SK_DROP;
-               if (LSW(ctx->remote_ip6[0], 0) == 0 && LSW(ctx->remote_ip6[0], 1) == 0 &&
-                   LSW(ctx->remote_ip6[1], 0) == 0 && LSW(ctx->remote_ip6[1], 1) == 0 &&
-                   LSW(ctx->remote_ip6[2], 0) == 0 && LSW(ctx->remote_ip6[2], 1) == 0 &&
-                   LSW(ctx->remote_ip6[3], 0) == 0 && LSW(ctx->remote_ip6[3], 1) == 0)
+               if (LSW(ctx->remote_ip6[0], 0) != ((SRC_IP6[0] >> 0) & 0xffff) ||
+                   LSW(ctx->remote_ip6[0], 1) != ((SRC_IP6[0] >> 16) & 0xffff) ||
+                   LSW(ctx->remote_ip6[1], 0) != ((SRC_IP6[1] >> 0) & 0xffff) ||
+                   LSW(ctx->remote_ip6[1], 1) != ((SRC_IP6[1] >> 16) & 0xffff) ||
+                   LSW(ctx->remote_ip6[2], 0) != ((SRC_IP6[2] >> 0) & 0xffff) ||
+                   LSW(ctx->remote_ip6[2], 1) != ((SRC_IP6[2] >> 16) & 0xffff) ||
+                   LSW(ctx->remote_ip6[3], 0) != ((SRC_IP6[3] >> 0) & 0xffff) ||
+                   LSW(ctx->remote_ip6[3], 1) != ((SRC_IP6[3] >> 16) & 0xffff))
                        return SK_DROP;
                /* Expect DST_IP6 in local_ip6 */
                if (LSB(ctx->local_ip6[0], 0) != ((DST_IP6[0] >> 0) & 0xff) ||
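
The LSB()/LSW() helpers used throughout these checks are defined earlier in this file; in rough sketch form (assuming the little-endian hosts these selftests target), they read one byte or one 16-bit word of a field so every narrow load width can be compared against the SRC_*/DST_* constants:

    /* Sketch of the narrow-load accessors (actual definitions appear earlier
     * in sk_lookup.c): index 0 is the least significant unit on little-endian.
     */
    #define LSB(value, index)       (((__u8 *)&(value))[index] & 0xff)
    #define LSW(value, index)       (((__u16 *)&(value))[index] & 0xffff)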
index a3a366c..fa22114 100644 (file)
@@ -31,13 +31,13 @@ struct {
 static volatile bool test_sockmap; /* toggled by user-space */
 
 SEC("sk_skb/stream_parser")
-int prog_skb_parser(struct __sk_buff *skb)
+int prog_stream_parser(struct __sk_buff *skb)
 {
        return skb->len;
 }
 
 SEC("sk_skb/stream_verdict")
-int prog_skb_verdict(struct __sk_buff *skb)
+int prog_stream_verdict(struct __sk_buff *skb)
 {
        unsigned int *count;
        __u32 zero = 0;
index 37bce7a..84cd632 100644 (file)
@@ -24,14 +24,29 @@ static const int cfg_port = 8000;
 
 static const int cfg_udp_src = 20000;
 
+#define        L2_PAD_SZ       (sizeof(struct vxlanhdr) + ETH_HLEN)
+
 #define        UDP_PORT                5555
 #define        MPLS_OVER_UDP_PORT      6635
 #define        ETH_OVER_UDP_PORT       7777
+#define        VXLAN_UDP_PORT          8472
+
+#define        EXTPROTO_VXLAN  0x1
+
+#define        VXLAN_N_VID     (1u << 24)
+#define        VXLAN_VNI_MASK  bpf_htonl((VXLAN_N_VID - 1) << 8)
+#define        VXLAN_FLAGS     0x8
+#define        VXLAN_VNI       1
 
 /* MPLS label 1000 with S bit (last label) set and ttl of 255. */
 static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
                                                     MPLS_LS_S_MASK | 0xff);
 
+struct vxlanhdr {
+       __be32 vx_flags;
+       __be32 vx_vni;
+} __attribute__((packed));
+
 struct gre_hdr {
        __be16 flags;
        __be16 protocol;
@@ -45,13 +60,13 @@ union l4hdr {
 struct v4hdr {
        struct iphdr ip;
        union l4hdr l4hdr;
-       __u8 pad[16];                   /* enough space for L2 header */
+       __u8 pad[L2_PAD_SZ];            /* space for L2 header / vxlan header ... */
 } __attribute__((packed));
 
 struct v6hdr {
        struct ipv6hdr ip;
        union l4hdr l4hdr;
-       __u8 pad[16];                   /* enough space for L2 header */
+       __u8 pad[L2_PAD_SZ];            /* space for L2 header / vxlan header ... */
 } __attribute__((packed));
 
 static __always_inline void set_ipv4_csum(struct iphdr *iph)
@@ -69,14 +84,15 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph)
        iph->check = ~((csum & 0xffff) + (csum >> 16));
 }
 
-static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
-                                     __u16 l2_proto)
+static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
+                                       __u16 l2_proto, __u16 ext_proto)
 {
        __u16 udp_dst = UDP_PORT;
        struct iphdr iph_inner;
        struct v4hdr h_outer;
        struct tcphdr tcph;
        int olen, l2_len;
+       __u8 *l2_hdr = NULL;
        int tcp_off;
        __u64 flags;
 
@@ -141,7 +157,11 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
                break;
        case ETH_P_TEB:
                l2_len = ETH_HLEN;
-               udp_dst = ETH_OVER_UDP_PORT;
+               if (ext_proto & EXTPROTO_VXLAN) {
+                       udp_dst = VXLAN_UDP_PORT;
+                       l2_len += sizeof(struct vxlanhdr);
+               } else
+                       udp_dst = ETH_OVER_UDP_PORT;
                break;
        }
        flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
@@ -171,14 +191,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
        }
 
        /* add L2 encap (if specified) */
+       l2_hdr = (__u8 *)&h_outer + olen;
        switch (l2_proto) {
        case ETH_P_MPLS_UC:
-               *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
+               *(__u32 *)l2_hdr = mpls_label;
                break;
        case ETH_P_TEB:
-               if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
-                                      ETH_HLEN))
+               flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;
+
+               if (ext_proto & EXTPROTO_VXLAN) {
+                       struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr;
+
+                       vxlan_hdr->vx_flags = VXLAN_FLAGS;
+                       vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8);
+
+                       l2_hdr += sizeof(struct vxlanhdr);
+               }
+
+               if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN))
                        return TC_ACT_SHOT;
+
                break;
        }
        olen += l2_len;
@@ -214,14 +246,21 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
        return TC_ACT_OK;
 }
 
-static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
+static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
                                      __u16 l2_proto)
 {
+       return __encap_ipv4(skb, encap_proto, l2_proto, 0);
+}
+
+static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
+                                       __u16 l2_proto, __u16 ext_proto)
+{
        __u16 udp_dst = UDP_PORT;
        struct ipv6hdr iph_inner;
        struct v6hdr h_outer;
        struct tcphdr tcph;
        int olen, l2_len;
+       __u8 *l2_hdr = NULL;
        __u16 tot_len;
        __u64 flags;
 
@@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
                break;
        case ETH_P_TEB:
                l2_len = ETH_HLEN;
-               udp_dst = ETH_OVER_UDP_PORT;
+               if (ext_proto & EXTPROTO_VXLAN) {
+                       udp_dst = VXLAN_UDP_PORT;
+                       l2_len += sizeof(struct vxlanhdr);
+               } else
+                       udp_dst = ETH_OVER_UDP_PORT;
                break;
        }
        flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
@@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
                h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src);
                h_outer.l4hdr.udp.dest = bpf_htons(udp_dst);
                tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) +
-                         sizeof(h_outer.l4hdr.udp);
+                         sizeof(h_outer.l4hdr.udp) + l2_len;
                h_outer.l4hdr.udp.check = 0;
                h_outer.l4hdr.udp.len = bpf_htons(tot_len);
                break;
@@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
        }
 
        /* add L2 encap (if specified) */
+       l2_hdr = (__u8 *)&h_outer + olen;
        switch (l2_proto) {
        case ETH_P_MPLS_UC:
-               *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
+               *(__u32 *)l2_hdr = mpls_label;
                break;
        case ETH_P_TEB:
-               if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
-                                      ETH_HLEN))
+               flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;
+
+               if (ext_proto & EXTPROTO_VXLAN) {
+                       struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr;
+
+                       vxlan_hdr->vx_flags = VXLAN_FLAGS;
+                       vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8);
+
+                       l2_hdr += sizeof(struct vxlanhdr);
+               }
+
+               if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN))
                        return TC_ACT_SHOT;
                break;
        }
@@ -309,6 +363,12 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
        return TC_ACT_OK;
 }
 
+static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
+                                     __u16 l2_proto)
+{
+       return __encap_ipv6(skb, encap_proto, l2_proto, 0);
+}
+
 SEC("encap_ipip_none")
 int __encap_ipip_none(struct __sk_buff *skb)
 {
@@ -372,6 +432,17 @@ int __encap_udp_eth(struct __sk_buff *skb)
                return TC_ACT_OK;
 }
 
+SEC("encap_vxlan_eth")
+int __encap_vxlan_eth(struct __sk_buff *skb)
+{
+       if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
+               return __encap_ipv4(skb, IPPROTO_UDP,
+                                   ETH_P_TEB,
+                                   EXTPROTO_VXLAN);
+       else
+               return TC_ACT_OK;
+}
+
 SEC("encap_sit_none")
 int __encap_sit_none(struct __sk_buff *skb)
 {
@@ -444,6 +515,17 @@ int __encap_ip6udp_eth(struct __sk_buff *skb)
                return TC_ACT_OK;
 }
 
+SEC("encap_ip6vxlan_eth")
+int __encap_ip6vxlan_eth(struct __sk_buff *skb)
+{
+       if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
+               return __encap_ipv6(skb, IPPROTO_UDP,
+                                   ETH_P_TEB,
+                                   EXTPROTO_VXLAN);
+       else
+               return TC_ACT_OK;
+}
+
 static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
 {
        char buf[sizeof(struct v6hdr)];
@@ -479,6 +561,9 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
                case ETH_OVER_UDP_PORT:
                        olen += ETH_HLEN;
                        break;
+               case VXLAN_UDP_PORT:
+                       olen += ETH_HLEN + sizeof(struct vxlanhdr);
+                       break;
                }
                break;
        default:
index 2db3c60..ac349a5 100755 (executable)
@@ -85,23 +85,6 @@ make_with_tmpdir() {
        echo
 }
 
-make_doc_and_clean() {
-       echo -e "\$PWD:    $PWD"
-       echo -e "command: make -s $* doc >/dev/null"
-       RST2MAN_OPTS="--exit-status=1" make $J -s $* doc
-       if [ $? -ne 0 ] ; then
-               ERROR=1
-               printf "FAILURE: Errors or warnings when building documentation\n"
-       fi
-       (
-               if [ $# -ge 1 ] ; then
-                       cd ${@: -1}
-               fi
-               make -s doc-clean
-       )
-       echo
-}
-
 echo "Trying to build bpftool"
 echo -e "... through kbuild\n"
 
@@ -162,7 +145,3 @@ make_and_clean
 make_with_tmpdir OUTPUT
 
 make_with_tmpdir O
-
-echo -e "Checking documentation build\n"
-# From tools/bpf/bpftool
-make_doc_and_clean
index 2023725..e2394ee 100644 (file)
@@ -66,4 +66,7 @@
 #define BTF_FUNC_ENC(name, func_proto) \
        BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), func_proto)
 
+#define BTF_TYPE_FLOAT_ENC(name, sz) \
+       BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 0), sz)
+
 #endif /* _TEST_BTF_H */
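
BTF_TYPE_FLOAT_ENC() follows the pattern of the other raw-BTF helpers above: BTF_KIND_FLOAT has no vlen and no member records, so a name offset and a byte size are the whole encoding. A hedged usage sketch for a raw-BTF test case (test scaffolding elided; NAME_TBD is the placeholder the BTF tests use for to-be-resolved string offsets):

    /* Hypothetical raw-BTF test fragment encoding two float types. */
    .raw_types = {
            BTF_TYPE_FLOAT_ENC(NAME_TBD, 4),        /* [1] float */
            BTF_TYPE_FLOAT_ENC(NAME_TBD, 8),        /* [2] double */
            BTF_END_RAW,
    },
    .str_sec = "\0float\0double",
    .str_sec_size = sizeof("\0float\0double"),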
diff --git a/tools/testing/selftests/bpf/test_doc_build.sh b/tools/testing/selftests/bpf/test_doc_build.sh
new file mode 100755 (executable)
index 0000000..7eb940a
--- /dev/null
@@ -0,0 +1,13 @@
+#!/bin/bash
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
+# Assume the script is located under tools/testing/selftests/bpf/. We want to
+# start build attempts from the top of the kernel repository.
+SCRIPT_REL_PATH=$(realpath --relative-to=$PWD $0)
+SCRIPT_REL_DIR=$(dirname $SCRIPT_REL_PATH)
+KDIR_ROOT_DIR=$(realpath $PWD/$SCRIPT_REL_DIR/../../../../)
+cd $KDIR_ROOT_DIR
+
+for tgt in docs docs-clean; do
+       make -s -C $PWD/$SCRIPT_REL_DIR $tgt;
+done
index f7c2fd8..e87c854 100644 (file)
@@ -152,6 +152,17 @@ extern int test__join_cgroup(const char *path);
        ___ok;                                                          \
 })
 
+#define ASSERT_LT(actual, expected, name) ({                           \
+       static int duration = 0;                                        \
+       typeof(actual) ___act = (actual);                               \
+       typeof(expected) ___exp = (expected);                           \
+       bool ___ok = ___act < ___exp;                                   \
+       CHECK(!___ok, (name),                                           \
+             "unexpected %s: actual %lld >= expected %lld\n",          \
+             (name), (long long)(___act), (long long)(___exp));        \
+       ___ok;                                                          \
+})
+
 #define ASSERT_STREQ(actual, expected, name) ({                                \
        static int duration = 0;                                        \
        const char *___act = actual;                                    \
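
ASSERT_LT mirrors the ASSERT_* helpers already in this header: both operands are evaluated once, a failure is reported through CHECK() with both values printed, and the result is a bool the caller can branch on. A usage sketch (the measured quantity is hypothetical):

    /* Hypothetical prog_tests snippet: fail the subtest unless the measured
     * duration stays below the bound; CHECK() already logged the values.
     */
    void test_example(void)
    {
            long long dur_ns = measure_something_ns();      /* hypothetical */

            if (!ASSERT_LT(dur_ns, 1000000, "duration_ns"))
                    return;
    }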
index 427ca00..eefd445 100644 (file)
@@ -732,7 +732,7 @@ static int sendmsg_test(struct sockmap_options *opt)
                 * socket is not a valid test. So in this case lets not
                 * enable kTLS but still run the test.
                 */
-               if (!txmsg_redir || (txmsg_redir && txmsg_ingress)) {
+               if (!txmsg_redir || txmsg_ingress) {
                        err = sockmap_init_ktls(opt->verbose, rx_fd);
                        if (err)
                                return err;
index 7c76b84..c9dde9b 100755 (executable)
@@ -44,8 +44,8 @@ setup() {
        # clamp route to reserve room for tunnel headers
        ip -netns "${ns1}" -4 route flush table main
        ip -netns "${ns1}" -6 route flush table main
-       ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1458 dev veth1
-       ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1438 dev veth1
+       ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1450 dev veth1
+       ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1430 dev veth1
 
        sleep 1
 
@@ -105,6 +105,12 @@ if [[ "$#" -eq "0" ]]; then
        echo "sit"
        $0 ipv6 sit none 100
 
+       echo "ip4 vxlan"
+       $0 ipv4 vxlan eth 2000
+
+       echo "ip6 vxlan"
+       $0 ipv6 ip6vxlan eth 2000
+
        for mac in none mpls eth ; do
                echo "ip gre $mac"
                $0 ipv4 gre $mac 100
@@ -214,6 +220,9 @@ if [[ "$tuntype" =~ "udp" ]]; then
        targs="encap fou encap-sport auto encap-dport $dport"
 elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
        ttype=$gretaptype
+elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then
+       ttype="vxlan"
+       targs="id 1 dstport 8472 udp6zerocsumrx"
 else
        ttype=$tuntype
        targs=""
@@ -242,7 +251,7 @@ if [[ "$tuntype" == "ip6udp" && "$mac" == "mpls" ]]; then
 elif [[ "$tuntype" =~ "udp" && "$mac" == "eth" ]]; then
        # No support for TEB fou tunnel; expect failure.
        expect_tun_fail=1
-elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
+elif [[ "$tuntype" =~ (gre|vxlan) && "$mac" == "eth" ]]; then
        # Share ethernet address between tunnel/veth2 so L2 decap works.
        ethaddr=$(ip netns exec "${ns2}" ip link show veth2 | \
                  awk '/ether/ { print $2 }')
index 58b5a34..1512092 100644 (file)
@@ -105,7 +105,7 @@ struct bpf_test {
        enum bpf_prog_type prog_type;
        uint8_t flags;
        void (*fill_helper)(struct bpf_test *self);
-       uint8_t runs;
+       int runs;
 #define bpf_testdata_struct_t                                  \
        struct {                                                \
                uint32_t retval, retval_unpriv;                 \
@@ -1165,7 +1165,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 
        run_errs = 0;
        run_successes = 0;
-       if (!alignment_prevented_execution && fd_prog >= 0) {
+       if (!alignment_prevented_execution && fd_prog >= 0 && test->runs >= 0) {
                uint32_t expected_val;
                int i;
 
index 88a7483..56d4474 100755 (executable)
 #
 # Run (full output without color-coding):
 #   sudo ./test_xsk.sh
+#
+# Run with verbose output:
+#   sudo ./test_xsk.sh -v
+#
+# Run and dump packet contents:
+#   sudo ./test_xsk.sh -D
 
 . xsk_prereqs.sh
 
-while getopts c flag
+while getopts "cvD" flag
 do
        case "${flag}" in
                c) colorconsole=1;;
+               v) verbose=1;;
+               D) dump_pkts=1;;
        esac
 done
 
@@ -95,13 +103,17 @@ NS1=af_xdp${VETH1_POSTFIX}
 MTU=1500
 
 setup_vethPairs() {
-       echo "setting up ${VETH0}: namespace: ${NS0}"
+       if [[ $verbose -eq 1 ]]; then
+               echo "setting up ${VETH0}: namespace: ${NS0}"
+       fi
        ip netns add ${NS1}
        ip link add ${VETH0} type veth peer name ${VETH1}
        if [ -f /proc/net/if_inet6 ]; then
                echo 1 > /proc/sys/net/ipv6/conf/${VETH0}/disable_ipv6
        fi
-       echo "setting up ${VETH1}: namespace: ${NS1}"
+       if [[ $verbose -eq 1 ]]; then
+               echo "setting up ${VETH1}: namespace: ${NS1}"
+       fi
        ip link set ${VETH1} netns ${NS1}
        ip netns exec ${NS1} ip link set ${VETH1} mtu ${MTU}
        ip link set ${VETH0} mtu ${MTU}
@@ -125,121 +137,24 @@ echo "${VETH0}:${VETH1},${NS1}" > ${SPECFILE}
 
 validate_veth_spec_file
 
-echo "Spec file created: ${SPECFILE}"
-
-test_status $retval "${TEST_NAME}"
-
-## START TESTS
-
-statusList=()
-
-### TEST 1
-TEST_NAME="XSK KSELFTEST FRAMEWORK"
-
-echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Generic mode"
-vethXDPgeneric ${VETH0} ${VETH1} ${NS1}
-
-retval=$?
-if [ $retval -eq 0 ]; then
-       echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Native mode"
-       vethXDPnative ${VETH0} ${VETH1} ${NS1}
+if [[ $verbose -eq 1 ]]; then
+        echo "Spec file created: ${SPECFILE}"
+       VERBOSE_ARG="-v"
 fi
 
-retval=$?
-test_status $retval "${TEST_NAME}"
-statusList+=($retval)
-
-### TEST 2
-TEST_NAME="SKB NOPOLL"
-
-vethXDPgeneric ${VETH0} ${VETH1} ${NS1}
-
-params=("-S")
-execxdpxceiver params
-
-retval=$?
-test_status $retval "${TEST_NAME}"
-statusList+=($retval)
-
-### TEST 3
-TEST_NAME="SKB POLL"
-
-vethXDPgeneric ${VETH0} ${VETH1} ${NS1}
-
-params=("-S" "-p")
-execxdpxceiver params
-
-retval=$?
-test_status $retval "${TEST_NAME}"
-statusList+=($retval)
-
-### TEST 4
-TEST_NAME="DRV NOPOLL"
-
-vethXDPnative ${VETH0} ${VETH1} ${NS1}
-
-params=("-N")
-execxdpxceiver params
-
-retval=$?
-test_status $retval "${TEST_NAME}"
-statusList+=($retval)
-
-### TEST 5
-TEST_NAME="DRV POLL"
-
-vethXDPnative ${VETH0} ${VETH1} ${NS1}
-
-params=("-N" "-p")
-execxdpxceiver params
-
-retval=$?
-test_status $retval "${TEST_NAME}"
-statusList+=($retval)
-
-### TEST 6
-TEST_NAME="SKB SOCKET TEARDOWN"
-
-vethXDPgeneric ${VETH0} ${VETH1} ${NS1}
-
-params=("-S" "-T")
-execxdpxceiver params
-
-retval=$?
-test_status $retval "${TEST_NAME}"
-statusList+=($retval)
-
-### TEST 7
-TEST_NAME="DRV SOCKET TEARDOWN"
-
-vethXDPnative ${VETH0} ${VETH1} ${NS1}
-
-params=("-N" "-T")
-execxdpxceiver params
+if [[ $dump_pkts -eq 1 ]]; then
+       DUMP_PKTS_ARG="-D"
+fi
 
-retval=$?
 test_status $retval "${TEST_NAME}"
-statusList+=($retval)
 
-### TEST 8
-TEST_NAME="SKB BIDIRECTIONAL SOCKETS"
-
-vethXDPgeneric ${VETH0} ${VETH1} ${NS1}
-
-params=("-S" "-B")
-execxdpxceiver params
-
-retval=$?
-test_status $retval "${TEST_NAME}"
-statusList+=($retval)
+## START TESTS
 
-### TEST 9
-TEST_NAME="DRV BIDIRECTIONAL SOCKETS"
+statusList=()
 
-vethXDPnative ${VETH0} ${VETH1} ${NS1}
+TEST_NAME="XSK KSELFTESTS"
 
-params=("-N" "-B")
-execxdpxceiver params
+execxdpxceiver
 
 retval=$?
 test_status $retval "${TEST_NAME}"
index fb13ca2..d78627b 100644 (file)
        .result = ACCEPT,
        .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
        .expected_attach_type = BPF_SK_LOOKUP,
+       .runs = -1,
 },
 /* invalid 8-byte reads from a 4-byte fields in bpf_sk_lookup */
 {
index 26ae8d0..2255489 100755 (executable)
@@ -17,6 +17,9 @@ KCONFIG_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vm
 KCONFIG_API_URL="https://api.github.com/repos/libbpf/libbpf/contents/travis-ci/vmtest/configs/latest.config"
 INDEX_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vmtest/configs/INDEX"
 NUM_COMPILE_JOBS="$(nproc)"
+LOG_FILE_BASE="$(date +"bpf_selftests.%Y-%m-%d_%H-%M-%S")"
+LOG_FILE="${LOG_FILE_BASE}.log"
+EXIT_STATUS_FILE="${LOG_FILE_BASE}.exit_status"
 
 usage()
 {
@@ -146,7 +149,6 @@ update_init_script()
        local init_script_dir="${OUTPUT_DIR}/${MOUNT_DIR}/etc/rcS.d"
        local init_script="${init_script_dir}/S50-startup"
        local command="$1"
-       local log_file="$2"
 
        mount_image
 
@@ -163,11 +165,16 @@ EOF
        sudo bash -c "cat >${init_script}" <<EOF
 #!/bin/bash
 
+# Have a default value in the exit status file
+# in case the VM is forcefully stopped.
+echo "130" > "/root/${EXIT_STATUS_FILE}"
+
 {
        cd /root/bpf
        echo ${command}
        stdbuf -oL -eL ${command}
-} 2>&1 | tee /root/${log_file}
+       echo "\$?" > "/root/${EXIT_STATUS_FILE}"
+} 2>&1 | tee "/root/${LOG_FILE}"
 poweroff -f
 EOF
 
@@ -221,10 +228,12 @@ EOF
 copy_logs()
 {
        local mount_dir="${OUTPUT_DIR}/${MOUNT_DIR}"
-       local log_file="${mount_dir}/root/$1"
+       local log_file="${mount_dir}/root/${LOG_FILE}"
+       local exit_status_file="${mount_dir}/root/${EXIT_STATUS_FILE}"
 
        mount_image
        sudo cp ${log_file} "${OUTPUT_DIR}"
+       sudo cp ${exit_status_file} "${OUTPUT_DIR}"
        sudo rm -f ${log_file}
        unmount_image
 }
@@ -263,7 +272,6 @@ main()
 {
        local script_dir="$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)"
        local kernel_checkout=$(realpath "${script_dir}"/../../../../)
-       local log_file="$(date +"bpf_selftests.%Y-%m-%d_%H-%M-%S.log")"
        # By default the script searches for the kernel in the checkout directory but
        # it also obeys environment variables O= and KBUILD_OUTPUT=
        local kernel_bzimage="${kernel_checkout}/${X86_BZIMAGE}"
@@ -347,19 +355,23 @@ main()
        fi
 
        update_selftests "${kernel_checkout}" "${make_command}"
-       update_init_script "${command}" "${log_file}"
+       update_init_script "${command}"
        run_vm "${kernel_bzimage}"
-       copy_logs "${log_file}"
-       echo "Logs saved in ${OUTPUT_DIR}/${log_file}"
+       copy_logs
+       echo "Logs saved in ${OUTPUT_DIR}/${LOG_FILE}"
 }
 
 catch()
 {
        local exit_code=$1
+       local exit_status_file="${OUTPUT_DIR}/${EXIT_STATUS_FILE}"
        # This is just a cleanup and the directory may
        # have already been unmounted. So, don't let this
        # clobber the error code we intend to return.
        unmount_image || true
+       if [[ -f "${exit_status_file}" ]]; then
+               exit_code="$(cat ${exit_status_file})"
+       fi
        exit ${exit_code}
 }
 
index f4a96d5..8b0f7fd 100644 (file)
  * These selftests test AF_XDP SKB and Native/DRV modes using veth
  * Virtual Ethernet interfaces.
  *
- * The following tests are run:
- *
- * 1. AF_XDP SKB mode
- *    Generic mode XDP is driver independent, used when the driver does
- *    not have support for XDP. Works on any netdevice using sockets and
- *    generic XDP path. XDP hook from netif_receive_skb().
+ * For each mode, the following tests are run:
  *    a. nopoll - soft-irq processing
  *    b. poll - using poll() syscall
  *    c. Socket Teardown
  *       Configure sockets as bi-directional tx/rx sockets, sets up fill and
  *       completion rings on each socket, tx/rx in both directions. Only nopoll
  *       mode is used
+ *    e. Statistics
+ *       Trigger some error conditions and ensure that the appropriate statistics
+ *       are incremented. Within this test, the following statistics are tested:
+ *       i.   rx dropped
+ *            Increase the UMEM frame headroom to a value which results in
+ *            insufficient space in the rx buffer for both the packet and the headroom.
+ *       ii.  tx invalid
+ *            Set the 'len' field of tx descriptors to an invalid value (umem frame
+ *            size + 1).
+ *       iii. rx ring full
+ *            Reduce the size of the RX ring to a fraction of the fill ring size.
+ *       iv.  fill queue empty
+ *            Do not populate the fill queue and then try to receive pkts.
  *
- * 2. AF_XDP DRV/Native mode
- *    Works on any netdevice with XDP_REDIRECT support, driver dependent. Processes
- *    packets before SKB allocation. Provides better performance than SKB. Driver
- *    hook available just after DMA of buffer descriptor.
- *    a. nopoll
- *    b. poll
- *    c. Socket Teardown
- *    d. Bi-directional sockets
- *    - Only copy mode is supported because veth does not currently support
- *      zero-copy mode
- *
- * Total tests: 8
+ * Total tests: 10
  *
  * Flow:
  * -----
@@ -58,7 +55,7 @@
  * - Rx thread verifies if all 10k packets were received and delivered in-order,
  *   and have the right content
  *
- * Enable/disable debug mode:
+ * Enable/disable packet dump mode:
  * --------------------------
  * To enable L2 - L4 headers and payload dump of each packet on STDOUT, add
  * parameter -D to params array in test_xsk.sh, i.e. params=("-S" "-D")
@@ -98,17 +95,24 @@ typedef __u16 __sum16;
 
 static void __exit_with_error(int error, const char *file, const char *func, int line)
 {
-       ksft_test_result_fail
-           ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error));
-       ksft_exit_xfail();
+       if (configured_mode == TEST_MODE_UNCONFIGURED) {
+               ksft_exit_fail_msg
+               ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error));
+       } else {
+               ksft_test_result_fail
+               ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error));
+               ksft_exit_xfail();
+       }
 }
 
 #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__)
 
 #define print_ksft_result(void)\
-       (ksft_test_result_pass("PASS: %s %s %s%s\n", uut ? "DRV" : "SKB", opt_poll ? "POLL" :\
-                              "NOPOLL", opt_teardown ? "Socket Teardown" : "",\
-                              opt_bidi ? "Bi-directional Sockets" : ""))
+       (ksft_test_result_pass("PASS: %s %s %s%s%s\n", configured_mode ? "DRV" : "SKB",\
+                              test_type == TEST_TYPE_POLL ? "POLL" : "NOPOLL",\
+                              test_type == TEST_TYPE_TEARDOWN ? "Socket Teardown" : "",\
+                              test_type == TEST_TYPE_BIDI ? "Bi-directional Sockets" : "",\
+                              test_type == TEST_TYPE_STATS ? "Stats" : ""))
 
 static void pthread_init_mutex(void)
 {
@@ -270,13 +274,20 @@ static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr)
 static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size)
 {
        int ret;
+       struct xsk_umem_config cfg = {
+               .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+               .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
+               .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
+               .frame_headroom = frame_headroom,
+               .flags = XSK_UMEM__DEFAULT_FLAGS
+       };
 
        data->umem = calloc(1, sizeof(struct xsk_umem_info));
        if (!data->umem)
                exit_with_error(errno);
 
        ret = xsk_umem__create(&data->umem->umem, buffer, size,
-                              &data->umem->fq, &data->umem->cq, NULL);
+                              &data->umem->fq, &data->umem->cq, &cfg);
        if (ret)
                exit_with_error(ret);
 
@@ -308,13 +319,13 @@ static int xsk_configure_socket(struct ifobject *ifobject)
                exit_with_error(errno);
 
        ifobject->xsk->umem = ifobject->umem;
-       cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+       cfg.rx_size = rxqsize;
        cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
        cfg.libbpf_flags = 0;
-       cfg.xdp_flags = opt_xdp_flags;
-       cfg.bind_flags = opt_xdp_bind_flags;
+       cfg.xdp_flags = xdp_flags;
+       cfg.bind_flags = xdp_bind_flags;
 
-       if (!opt_bidi) {
+       if (test_type != TEST_TYPE_BIDI) {
                rxr = (ifobject->fv.vector == rx) ? &ifobject->xsk->rx : NULL;
                txr = (ifobject->fv.vector == tx) ? &ifobject->xsk->tx : NULL;
        } else {
@@ -334,13 +345,8 @@ static int xsk_configure_socket(struct ifobject *ifobject)
 static struct option long_options[] = {
        {"interface", required_argument, 0, 'i'},
        {"queue", optional_argument, 0, 'q'},
-       {"poll", no_argument, 0, 'p'},
-       {"xdp-skb", no_argument, 0, 'S'},
-       {"xdp-native", no_argument, 0, 'N'},
-       {"copy", no_argument, 0, 'c'},
-       {"tear-down", no_argument, 0, 'T'},
-       {"bidi", optional_argument, 0, 'B'},
-       {"debug", optional_argument, 0, 'D'},
+       {"dump-pkts", optional_argument, 0, 'D'},
+       {"verbose", no_argument, 0, 'v'},
        {"tx-pkt-count", optional_argument, 0, 'C'},
        {0, 0, 0, 0}
 };
@@ -352,13 +358,8 @@ static void usage(const char *prog)
            "  Options:\n"
            "  -i, --interface      Use interface\n"
            "  -q, --queue=n        Use queue n (default 0)\n"
-           "  -p, --poll           Use poll syscall\n"
-           "  -S, --xdp-skb=n      Use XDP SKB mode\n"
-           "  -N, --xdp-native=n   Enforce XDP DRV (native) mode\n"
-           "  -c, --copy           Force copy mode\n"
-           "  -T, --tear-down      Tear down sockets by repeatedly recreating them\n"
-           "  -B, --bidi           Bi-directional sockets test\n"
-           "  -D, --debug          Debug mode - dump packets L2 - L5\n"
+           "  -D, --dump-pkts      Dump packets L2 - L5\n"
+           "  -v, --verbose        Verbose output\n"
            "  -C, --tx-pkt-count=n Number of packets to send\n";
        ksft_print_msg(str, prog);
 }
@@ -392,7 +393,7 @@ static void *nsswitchthread(void *args)
                        ksft_test_result_fail("ERROR: [%s] interface \"%s\" does not exist\n",
                                              __func__, ifdict[targs->idx]->ifname);
                } else {
-                       ksft_print_msg("Interface found: %s\n", ifdict[targs->idx]->ifname);
+                       print_verbose("Interface found: %s\n", ifdict[targs->idx]->ifname);
                        targs->retptr = true;
                }
        }
@@ -422,7 +423,7 @@ static int validate_interfaces(void)
                        pthread_join(ns_thread, NULL);
 
                        if (targs->retptr)
-                               ksft_print_msg("NS switched: %s\n", ifdict[i]->nsname);
+                               print_verbose("NS switched: %s\n", ifdict[i]->nsname);
 
                        free(targs);
                } else {
@@ -432,7 +433,7 @@ static int validate_interfaces(void)
                                    ("ERROR: interface \"%s\" does not exist\n", ifdict[i]->ifname);
                                ret = false;
                        } else {
-                               ksft_print_msg("Interface found: %s\n", ifdict[i]->ifname);
+                               print_verbose("Interface found: %s\n", ifdict[i]->ifname);
                        }
                }
        }
@@ -446,7 +447,7 @@ static void parse_command_line(int argc, char **argv)
        opterr = 0;
 
        for (;;) {
-               c = getopt_long(argc, argv, "i:q:pSNcTBDC:", long_options, &option_index);
+               c = getopt_long(argc, argv, "i:q:DC:v", long_options, &option_index);
 
                if (c == -1)
                        break;
@@ -469,40 +470,26 @@ static void parse_command_line(int argc, char **argv)
                case 'q':
                        opt_queue = atoi(optarg);
                        break;
-               case 'p':
-                       opt_poll = 1;
-                       break;
-               case 'S':
-                       opt_xdp_flags |= XDP_FLAGS_SKB_MODE;
-                       opt_xdp_bind_flags |= XDP_COPY;
-                       uut = ORDER_CONTENT_VALIDATE_XDP_SKB;
-                       break;
-               case 'N':
-                       opt_xdp_flags |= XDP_FLAGS_DRV_MODE;
-                       opt_xdp_bind_flags |= XDP_COPY;
-                       uut = ORDER_CONTENT_VALIDATE_XDP_DRV;
-                       break;
-               case 'c':
-                       opt_xdp_bind_flags |= XDP_COPY;
-                       break;
-               case 'T':
-                       opt_teardown = 1;
-                       break;
-               case 'B':
-                       opt_bidi = 1;
-                       break;
                case 'D':
                        debug_pkt_dump = 1;
                        break;
                case 'C':
                        opt_pkt_count = atoi(optarg);
                        break;
+               case 'v':
+                       opt_verbose = 1;
+                       break;
                default:
                        usage(basename(argv[0]));
                        ksft_exit_xfail();
                }
        }
 
+       if (!opt_pkt_count) {
+               print_verbose("No tx-pkt-count specified, using default %u\n", DEFAULT_PKT_CNT);
+               opt_pkt_count = DEFAULT_PKT_CNT;
+       }
+
        if (!validate_interfaces()) {
                usage(basename(argv[0]));
                ksft_exit_xfail();
@@ -599,6 +586,8 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size)
 {
        u32 idx;
        unsigned int i;
+       bool tx_invalid_test = stat_test_type == STAT_TEST_TX_INVALID;
+       u32 len = tx_invalid_test ? XSK_UMEM__DEFAULT_FRAME_SIZE + 1 : PKT_SIZE;
 
        while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < batch_size)
                complete_tx_only(xsk, batch_size);
@@ -607,11 +596,16 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size)
                struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i);
 
                tx_desc->addr = (*frameptr + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT;
-               tx_desc->len = PKT_SIZE;
+               tx_desc->len = len;
        }
 
        xsk_ring_prod__submit(&xsk->tx, batch_size);
-       xsk->outstanding_tx += batch_size;
+       if (!tx_invalid_test) {
+               xsk->outstanding_tx += batch_size;
+       } else {
+               if (!NEED_WAKEUP || xsk_ring_prod__needs_wakeup(&xsk->tx))
+                       kick_tx(xsk);
+       }
        *frameptr += batch_size;
        *frameptr %= num_frames;
        complete_tx_only(xsk, batch_size);
@@ -654,7 +648,7 @@ static void tx_only_all(struct ifobject *ifobject)
        while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) {
                int batch_size = get_batch_size(pkt_cnt);
 
-               if (opt_poll) {
+               if (test_type == TEST_TYPE_POLL) {
                        ret = poll(fds, 1, POLL_TMOUT);
                        if (ret <= 0)
                                continue;
@@ -714,7 +708,7 @@ static void worker_pkt_dump(void)
                int payload = *((uint32_t *)(pkt_buf[iter]->payload + PKT_HDR_SIZE));
 
                if (payload == EOT) {
-                       ksft_print_msg("End-of-transmission frame received\n");
+                       print_verbose("End-of-transmission frame received\n");
                        fprintf(stdout, "---------------------------------------\n");
                        break;
                }
@@ -723,6 +717,48 @@ static void worker_pkt_dump(void)
        }
 }
 
+static void worker_stats_validate(struct ifobject *ifobject)
+{
+       struct xdp_statistics stats;
+       socklen_t optlen;
+       int err;
+       struct xsk_socket *xsk = stat_test_type == STAT_TEST_TX_INVALID ?
+                                                       ifdict[!ifobject->ifdict_index]->xsk->xsk :
+                                                       ifobject->xsk->xsk;
+       int fd = xsk_socket__fd(xsk);
+       unsigned long xsk_stat = 0, expected_stat = opt_pkt_count;
+
+       sigvar = 0;
+
+       optlen = sizeof(stats);
+       err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen);
+       if (err)
+               return;
+
+       if (optlen == sizeof(struct xdp_statistics)) {
+               switch (stat_test_type) {
+               case STAT_TEST_RX_DROPPED:
+                       xsk_stat = stats.rx_dropped;
+                       break;
+               case STAT_TEST_TX_INVALID:
+                       xsk_stat = stats.tx_invalid_descs;
+                       break;
+               case STAT_TEST_RX_FULL:
+                       xsk_stat = stats.rx_ring_full;
+                       expected_stat -= RX_FULL_RXQSIZE;
+                       break;
+               case STAT_TEST_RX_FILL_EMPTY:
+                       xsk_stat = stats.rx_fill_ring_empty_descs;
+                       break;
+               default:
+                       break;
+               }
+
+               if (xsk_stat == expected_stat)
+                       sigvar = 1;
+       }
+}
+
 static void worker_pkt_validate(void)
 {
        u32 payloadseqnum = -2;
@@ -746,7 +782,7 @@ static void worker_pkt_validate(void)
                        }
 
                        if (payloadseqnum == EOT) {
-                               ksft_print_msg("End-of-transmission frame received: PASS\n");
+                               print_verbose("End-of-transmission frame received: PASS\n");
                                sigvar = 1;
                                break;
                        }
@@ -836,7 +872,7 @@ static void *worker_testapp_validate(void *arg)
                        usleep(USLEEP_MAX);
                }
 
-               ksft_print_msg("Interface [%s] vector [Tx]\n", ifobject->ifname);
+               print_verbose("Interface [%s] vector [Tx]\n", ifobject->ifname);
                for (int i = 0; i < num_frames; i++) {
                        /*send EOT frame */
                        if (i == (num_frames - 1))
@@ -850,7 +886,7 @@ static void *worker_testapp_validate(void *arg)
                        gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE);
                }
 
-               ksft_print_msg("Sending %d packets on interface %s\n",
+               print_verbose("Sending %d packets on interface %s\n",
                               (opt_pkt_count - 1), ifobject->ifname);
                tx_only_all(ifobject);
        } else if (ifobject->fv.vector == rx) {
@@ -860,8 +896,9 @@ static void *worker_testapp_validate(void *arg)
                if (!bidi_pass)
                        thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_rx);
 
-               ksft_print_msg("Interface [%s] vector [Rx]\n", ifobject->ifname);
-               xsk_populate_fill_ring(ifobject->umem);
+               print_verbose("Interface [%s] vector [Rx]\n", ifobject->ifname);
+               if (stat_test_type != STAT_TEST_RX_FILL_EMPTY)
+                       xsk_populate_fill_ring(ifobject->umem);
 
                TAILQ_INIT(&head);
                if (debug_pkt_dump) {
@@ -878,26 +915,32 @@ static void *worker_testapp_validate(void *arg)
                pthread_mutex_unlock(&sync_mutex);
 
                while (1) {
-                       if (opt_poll) {
+                       if (test_type == TEST_TYPE_POLL) {
                                ret = poll(fds, 1, POLL_TMOUT);
                                if (ret <= 0)
                                        continue;
                        }
-                       rx_pkt(ifobject->xsk, fds);
-                       worker_pkt_validate();
+
+                       if (test_type != TEST_TYPE_STATS) {
+                               rx_pkt(ifobject->xsk, fds);
+                               worker_pkt_validate();
+                       } else {
+                               worker_stats_validate(ifobject);
+                       }
 
                        if (sigvar)
                                break;
                }
 
-               ksft_print_msg("Received %d packets on interface %s\n",
-                              pkt_counter, ifobject->ifname);
+               if (test_type != TEST_TYPE_STATS)
+                       print_verbose("Received %d packets on interface %s\n",
+                               pkt_counter, ifobject->ifname);
 
-               if (opt_teardown)
-                       ksft_print_msg("Destroying socket\n");
+               if (test_type == TEST_TYPE_TEARDOWN)
+                       print_verbose("Destroying socket\n");
        }
 
-       if (!opt_bidi || bidi_pass) {
+       if ((test_type != TEST_TYPE_BIDI) || bidi_pass) {
                xsk_socket__delete(ifobject->xsk->xsk);
                (void)xsk_umem__delete(ifobject->umem->umem);
        }
@@ -907,14 +950,15 @@ static void *worker_testapp_validate(void *arg)
 static void testapp_validate(void)
 {
        struct timespec max_wait = { 0, 0 };
+       bool bidi = test_type == TEST_TYPE_BIDI;
 
        pthread_attr_init(&attr);
        pthread_attr_setstacksize(&attr, THREAD_STACK);
 
-       if (opt_bidi && bidi_pass) {
+       if ((test_type == TEST_TYPE_BIDI) && bidi_pass) {
                pthread_init_mutex();
                if (!switching_notify) {
-                       ksft_print_msg("Switching Tx/Rx vectors\n");
+                       print_verbose("Switching Tx/Rx vectors\n");
                        switching_notify++;
                }
        }
@@ -922,10 +966,10 @@ static void testapp_validate(void)
        pthread_mutex_lock(&sync_mutex);
 
        /*Spawn RX thread */
-       if (!opt_bidi || !bidi_pass) {
+       if (!bidi || !bidi_pass) {
                if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[1]))
                        exit_with_error(errno);
-       } else if (opt_bidi && bidi_pass) {
+       } else if (bidi && bidi_pass) {
                /*switch Tx/Rx vectors */
                ifdict[0]->fv.vector = rx;
                if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[0]))
@@ -942,10 +986,10 @@ static void testapp_validate(void)
        pthread_mutex_unlock(&sync_mutex);
 
        /*Spawn TX thread */
-       if (!opt_bidi || !bidi_pass) {
+       if (!bidi || !bidi_pass) {
                if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[0]))
                        exit_with_error(errno);
-       } else if (opt_bidi && bidi_pass) {
+       } else if (bidi && bidi_pass) {
                /*switch Tx/Rx vectors */
                ifdict[1]->fv.vector = tx;
                if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[1]))
@@ -964,19 +1008,46 @@ static void testapp_validate(void)
                free(pkt_buf);
        }
 
-       if (!opt_teardown && !opt_bidi)
+       if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !(test_type == TEST_TYPE_STATS))
                print_ksft_result();
 }
 
 static void testapp_sockets(void)
 {
-       for (int i = 0; i < (opt_teardown ? MAX_TEARDOWN_ITER : MAX_BIDI_ITER); i++) {
+       for (int i = 0; i < ((test_type == TEST_TYPE_TEARDOWN) ? MAX_TEARDOWN_ITER : MAX_BIDI_ITER);
+            i++) {
                pkt_counter = 0;
                prev_pkt = -1;
                sigvar = 0;
-               ksft_print_msg("Creating socket\n");
+               print_verbose("Creating socket\n");
+               testapp_validate();
+               if (test_type == TEST_TYPE_BIDI)
+                       bidi_pass++;
+       }
+
+       print_ksft_result();
+}
+
+static void testapp_stats(void)
+{
+       for (int i = 0; i < STAT_TEST_TYPE_MAX; i++) {
+               stat_test_type = i;
+
+               /* reset defaults */
+               rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+               frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
+
+               switch (stat_test_type) {
+               case STAT_TEST_RX_DROPPED:
+                       frame_headroom = XSK_UMEM__DEFAULT_FRAME_SIZE -
+                                               XDP_PACKET_HEADROOM - 1;
+                       break;
+               case STAT_TEST_RX_FULL:
+                       rxqsize = RX_FULL_RXQSIZE;
+                       break;
+               default:
+                       break;
+               }
                testapp_validate();
-               opt_bidi ? bidi_pass++ : bidi_pass;
        }
 
        print_ksft_result();
@@ -1003,6 +1074,104 @@ static void init_iface_config(struct ifaceconfigobj *ifaceconfig)
        ifdict[1]->src_port = ifaceconfig->dst_port;
 }
 
+static void *nsdisablemodethread(void *args)
+{
+       struct targs *targs = args;
+
+       targs->retptr = false;
+
+       if (switch_namespace(targs->idx)) {
+               targs->retptr = bpf_set_link_xdp_fd(ifdict[targs->idx]->ifindex, -1, targs->flags);
+       } else {
+               targs->retptr = errno;
+               print_verbose("Failed to switch namespace to %s\n", ifdict[targs->idx]->nsname);
+       }
+
+       pthread_exit(NULL);
+}
+
+static void disable_xdp_mode(int mode)
+{
+       int err = 0;
+       __u32 flags = XDP_FLAGS_UPDATE_IF_NOEXIST | mode;
+       char *mode_str = mode & XDP_FLAGS_SKB_MODE ? "skb" : "drv";
+
+       for (int i = 0; i < MAX_INTERFACES; i++) {
+               if (strcmp(ifdict[i]->nsname, "")) {
+                       struct targs *targs;
+
+                       targs = malloc(sizeof(*targs));
+                       if (!targs)
+                               exit_with_error(errno);
+                       memset(targs, 0, sizeof(*targs));
+
+                       targs->idx = i;
+                       targs->flags = flags;
+                       if (pthread_create(&ns_thread, NULL, nsdisablemodethread, targs))
+                               exit_with_error(errno);
+
+                       pthread_join(ns_thread, NULL);
+                       err = targs->retptr;
+                       free(targs);
+               } else {
+                       err = bpf_set_link_xdp_fd(ifdict[i]->ifindex, -1, flags);
+               }
+
+               if (err) {
+                       print_verbose("Failed to disable %s mode on interface %s\n",
+                                               mode_str, ifdict[i]->ifname);
+                       exit_with_error(err);
+               }
+
+               print_verbose("Disabled %s mode for interface: %s\n", mode_str, ifdict[i]->ifname);
+               configured_mode = mode & XDP_FLAGS_SKB_MODE ? TEST_MODE_DRV : TEST_MODE_SKB;
+       }
+}
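
Passing fd == -1 to libbpf's bpf_set_link_xdp_fd() removes whatever XDP
program is attached in the mode selected by flags, which is all
disable_xdp_mode() needs. A self-contained sketch of that call, against a
hypothetical veth0 interface:

	/* Minimal sketch: detach any XDP program in skb (generic) mode
	 * from "veth0" (hypothetical name), mirroring disable_xdp_mode(). */
	#include <net/if.h>
	#include <linux/if_link.h>
	#include <bpf/libbpf.h>

	int main(void)
	{
		int ifindex = if_nametoindex("veth0");
		__u32 flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_SKB_MODE;

		if (!ifindex)
			return 1;
		/* fd == -1 means "remove the attached program" */
		if (bpf_set_link_xdp_fd(ifindex, -1, flags))
			return 1;
		return 0;
	}
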
+
+static void run_pkt_test(int mode, int type)
+{
+       test_type = type;
+
+       /* reset defaults after potential previous test */
+       xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
+       pkt_counter = 0;
+       switching_notify = 0;
+       bidi_pass = 0;
+       prev_pkt = -1;
+       ifdict[0]->fv.vector = tx;
+       ifdict[1]->fv.vector = rx;
+       sigvar = 0;
+       stat_test_type = -1;
+       rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+       frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
+
+       switch (mode) {
+       case (TEST_MODE_SKB):
+               if (configured_mode == TEST_MODE_DRV)
+                       disable_xdp_mode(XDP_FLAGS_DRV_MODE);
+               xdp_flags |= XDP_FLAGS_SKB_MODE;
+               break;
+       case (TEST_MODE_DRV):
+               if (configured_mode == TEST_MODE_SKB)
+                       disable_xdp_mode(XDP_FLAGS_SKB_MODE);
+               xdp_flags |= XDP_FLAGS_DRV_MODE;
+               break;
+       default:
+               break;
+       }
+
+       pthread_init_mutex();
+
+       if (test_type == TEST_TYPE_STATS)
+               testapp_stats();
+       else if ((test_type != TEST_TYPE_TEARDOWN) && (test_type != TEST_TYPE_BIDI))
+               testapp_validate();
+       else
+               testapp_sockets();
+
+       pthread_destroy_mutex();
+}
+
 int main(int argc, char **argv)
 {
        struct rlimit _rlim = { RLIM_INFINITY, RLIM_INFINITY };
@@ -1016,6 +1185,7 @@ int main(int argc, char **argv)
        const char *IP2 = "192.168.100.161";
        u16 UDP_DST_PORT = 2020;
        u16 UDP_SRC_PORT = 2121;
+       int i, j;
 
        ifaceconfig = malloc(sizeof(struct ifaceconfigobj));
        memcpy(ifaceconfig->dst_mac, MAC1, ETH_ALEN);
@@ -1041,24 +1211,18 @@ int main(int argc, char **argv)
 
        init_iface_config(ifaceconfig);
 
-       pthread_init_mutex();
+       disable_xdp_mode(XDP_FLAGS_DRV_MODE);
 
-       ksft_set_plan(1);
+       ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX);
 
-       if (!opt_teardown && !opt_bidi) {
-               testapp_validate();
-       } else if (opt_teardown && opt_bidi) {
-               ksft_test_result_fail("ERROR: parameters -T and -B cannot be used together\n");
-               ksft_exit_xfail();
-       } else {
-               testapp_sockets();
+       for (i = 0; i < TEST_MODE_MAX; i++) {
+               for (j = 0; j < TEST_TYPE_MAX; j++)
+                       run_pkt_test(i, j);
        }
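
With the enums added to xdpxceiver.h below, TEST_MODE_MAX is 2 and
TEST_TYPE_MAX is 5, so this nested loop produces the 2 * 5 = 10 subtests
that ksft_set_plan() announces above. An illustration of the (mode, type)
pairs run_pkt_test() receives:

	/* Illustration only, mirroring the enums in xdpxceiver.h below. */
	#include <stdio.h>

	enum { TEST_MODE_SKB, TEST_MODE_DRV, TEST_MODE_MAX };
	enum { TEST_TYPE_NOPOLL, TEST_TYPE_POLL, TEST_TYPE_TEARDOWN,
	       TEST_TYPE_BIDI, TEST_TYPE_STATS, TEST_TYPE_MAX };

	int main(void)
	{
		for (int i = 0; i < TEST_MODE_MAX; i++)
			for (int j = 0; j < TEST_TYPE_MAX; j++)
				printf("mode=%d type=%d\n", i, j); /* 10 pairs */
		return 0;
	}
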
 
        for (int i = 0; i < MAX_INTERFACES; i++)
                free(ifdict[i]);
 
-       pthread_destroy_mutex();
-
        ksft_exit_pass();
 
        return 0;
index 0e9f9b7..30314ef 100644 (file)
 #define BATCH_SIZE 64
 #define POLL_TMOUT 1000
 #define NEED_WAKEUP true
+#define DEFAULT_PKT_CNT 10000
+#define RX_FULL_RXQSIZE 32
+
+#define print_verbose(x...) do { if (opt_verbose) ksft_print_msg(x); } while (0)
 
 typedef __u32 u32;
 typedef __u16 u16;
 typedef __u8 u8;
 
-enum TESTS {
-       ORDER_CONTENT_VALIDATE_XDP_SKB = 0,
-       ORDER_CONTENT_VALIDATE_XDP_DRV = 1,
+enum TEST_MODES {
+       TEST_MODE_UNCONFIGURED = -1,
+       TEST_MODE_SKB,
+       TEST_MODE_DRV,
+       TEST_MODE_MAX
+};
+
+enum TEST_TYPES {
+       TEST_TYPE_NOPOLL,
+       TEST_TYPE_POLL,
+       TEST_TYPE_TEARDOWN,
+       TEST_TYPE_BIDI,
+       TEST_TYPE_STATS,
+       TEST_TYPE_MAX
 };
 
-u8 uut;
-u8 debug_pkt_dump;
-u32 num_frames;
-u8 switching_notify;
-u8 bidi_pass;
+enum STAT_TEST_TYPES {
+       STAT_TEST_RX_DROPPED,
+       STAT_TEST_TX_INVALID,
+       STAT_TEST_RX_FULL,
+       STAT_TEST_RX_FILL_EMPTY,
+       STAT_TEST_TYPE_MAX
+};
+
+static int configured_mode = TEST_MODE_UNCONFIGURED;
+static u8 debug_pkt_dump;
+static u32 num_frames;
+static u8 switching_notify;
+static u8 bidi_pass;
+static int test_type;
 
-static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
 static int opt_queue;
 static int opt_pkt_count;
-static int opt_poll;
-static int opt_teardown;
-static int opt_bidi;
-static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP;
+static u8 opt_verbose;
+
+static u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
+static u32 xdp_bind_flags = XDP_USE_NEED_WAKEUP | XDP_COPY;
 static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE];
 static u32 pkt_counter;
-static u32 prev_pkt = -1;
+static long prev_pkt = -1;
 static int sigvar;
+static int stat_test_type;
+static u32 rxqsize;
+static u32 frame_headroom;
 
 struct xsk_umem_info {
        struct xsk_ring_prod fq;
@@ -137,8 +163,9 @@ pthread_t t0, t1, ns_thread;
 pthread_attr_t attr;
 
 struct targs {
-       bool retptr;
+       u8 retptr;
        int idx;
+       u32 flags;
 };
 
 TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head);
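
The print_verbose() macro defined above wraps its body in do { ... } while (0)
so the expansion behaves as a single statement; without the wrapper, the
embedded if would pair with a following else in the caller. A minimal
illustration (opt_verbose here is a stand-in for the real flag):

	/* Why print_verbose() needs the do/while(0) wrapper. */
	#include <stdio.h>

	static int opt_verbose = 1;

	#define print_verbose(x...) do { if (opt_verbose) printf(x); } while (0)

	int main(void)
	{
		int ok = 0;

		if (ok)
			print_verbose("passed\n");
		else	/* binds to the outer if, thanks to the wrapper */
			printf("failed\n");
		return 0;
	}
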
index 9d54c46..dac1c5f 100755 (executable)
@@ -82,24 +82,21 @@ clear_configs()
 {
        if [ $(ip netns show | grep $3 &>/dev/null; echo $?;) == 0 ]; then
                [ $(ip netns exec $3 ip link show $2 &>/dev/null; echo $?;) == 0 ] &&
-                       { echo "removing link $1:$2"; ip netns exec $3 ip link del $2; }
-               echo "removing ns $3"
+                       { ip netns exec $3 ip link del $2; }
                ip netns del $3
        fi
        #Deleting one end of a veth pair removes the entire pair. This is
        #just a precaution: if the NS does not exist, the end inside it was
        #never deleted, so remove the remaining host-side end explicitly.
        [ $(ip link show $1 &>/dev/null; echo $?;) == 0 ] &&
-               { echo "removing link $1"; ip link del $1; }
+               { ip link del $1; }
        if [ -f ${SPECFILE} ]; then
-               echo "removing spec file:" ${SPECFILE}
                rm -f ${SPECFILE}
        fi
 }
 
 cleanup_exit()
 {
-       echo "cleaning up..."
        clear_configs $1 $2 $3
 }
 
@@ -108,28 +105,7 @@ validate_ip_utility()
        [ ! $(type -P ip) ] && { echo "'ip' not found. Skipping tests."; test_exit $ksft_skip 1; }
 }
 
-vethXDPgeneric()
-{
-       ip link set dev $1 xdpdrv off
-       ip netns exec $3 ip link set dev $2 xdpdrv off
-}
-
-vethXDPnative()
-{
-       ip link set dev $1 xdpgeneric off
-       ip netns exec $3 ip link set dev $2 xdpgeneric off
-}
-
 execxdpxceiver()
 {
-       local -a 'paramkeys=("${!'"$1"'[@]}")' copy
-       paramkeysstr=${paramkeys[*]}
-
-       for index in $paramkeysstr;
-               do
-                       current=$1"[$index]"
-                       copy[$index]=${!current}
-               done
-
-       ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${copy[*]} -C ${NUMPKTS}
+       ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} -C ${NUMPKTS} ${VERBOSE_ARG} ${DUMP_PKTS_ARG}
 }