Merge https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
author Jakub Kicinski <kuba@kernel.org>
Mon, 30 Aug 2021 23:42:46 +0000 (16:42 -0700)
committer Jakub Kicinski <kuba@kernel.org>
Mon, 30 Aug 2021 23:42:47 +0000 (16:42 -0700)
Daniel Borkmann says:

====================
bpf-next 2021-08-31

We've added 116 non-merge commits during the last 17 days, which contain
a total of 126 files changed, 6813 insertions(+), 4027 deletions(-).

The main changes are:

1) Add an opaque bpf_cookie to perf links which the program can read out
   again, to be used in a libbpf-based USDT library, from Andrii Nakryiko
   (see the sketches after this list).

2) Add a bpf_task_pt_regs() helper to access userspace pt_regs, from Daniel Xu
   (sketch below).

3) Add support for UNIX stream type sockets for BPF sockmap, from Jiang Wang.

4) Allow BPF TCP congestion control progs to call bpf_setsockopt(), e.g. to
   switch to another congestion control algorithm during init, from
   Martin KaFai Lau (sketch below).

5) Extend BPF iterator support for UNIX domain sockets, from Kuniyuki Iwashima.

6) Allow bpf_{set,get}sockopt() calls from setsockopt progs, from Prankur Gupta.

7) Add a bpf_get_netns_cookie() helper for BPF_PROG_TYPE_{SOCK_OPS,CGROUP_SOCKOPT}
   progs, from Xu Liu and Stanislav Fomichev (sketch below).

8) Support for __weak typed ksyms in libbpf, from Hao Luo (sketch below).

9) Shrink struct cgroup_bpf by 504 bytes through refactoring, from Dave Marchevsky.

10) Fix a smatch complaint in verifier's narrow load handling, from Andrey Ignatov.

11) Fix BPF interpreter's tail call count limit, from Daniel Borkmann.

12) Big batch of improvements to BPF selftests, from Magnus Karlsson, Li Zhijian,
    Yucong Sun, Yonghong Song, Ilya Leoshkevich, Jussi Maki, and others.

13) Another big batch to revamp XDP samples in order to give them a consistent
    look and feel, from Kumar Kartikeya Dwivedi.
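
To make the new pieces above concrete, here are a few illustrative sketches
for items 1), 2), 4), 7) and 8). They are not part of the pull request;
they are loosely modeled on the selftests added in this series, and all
program names, attach points and constants (do_sys_openat2, do_nanosleep,
the cookie value 42, etc.) are made-up assumptions for illustration only.

Sketch for 1), reading the cookie back via the new bpf_get_attach_cookie()
helper; userspace supplies it through the attach opts:

  /* BPF side */
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  SEC("kprobe/do_sys_openat2")
  int handle_open(struct pt_regs *ctx)
  {
          /* the u64 userspace stored in the perf link at attach time */
          __u64 cookie = bpf_get_attach_cookie(ctx);

          bpf_printk("openat2, cookie=%llu", cookie);
          return 0;
  }

  char LICENSE[] SEC("license") = "GPL";

  /* userspace fragment, assuming a skeleton named "skel" */
  DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts, .bpf_cookie = 42);
  link = bpf_program__attach_kprobe_opts(skel->progs.handle_open,
                                         "do_sys_openat2", &opts);

Sketch for 2), snapshotting the current task's saved userspace registers
with bpf_task_pt_regs(), much like the new task_pt_regs selftest:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  struct pt_regs regs_snapshot;

  SEC("kprobe/do_nanosleep")
  int snapshot_regs(void *ctx)
  {
          struct task_struct *task = bpf_get_current_task_btf();
          struct pt_regs *regs = (struct pt_regs *)bpf_task_pt_regs(task);

          /* copy the task's saved user registers out to a global */
          __builtin_memcpy(&regs_snapshot, regs, sizeof(regs_snapshot));
          return 0;
  }

  char LICENSE[] SEC("license") = "GPL";

Sketch for 4), a bpf-tcp-cc init handing the socket over to a fallback
algorithm via bpf_setsockopt(), modeled on the new dctcp fallback selftest.
The full tcp_congestion_ops registration is elided, and the TCP_* constants
are redefined because, being macros, they are absent from vmlinux.h:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  #define SOL_TCP          6
  #define TCP_CONGESTION  13
  #define TCP_ECN_OK       1
  #define TCP_CA_NAME_MAX 16

  char fallback_cc[TCP_CA_NAME_MAX] = "cubic";

  SEC("struct_ops/dctcp_init")
  void BPF_PROG(dctcp_init, struct sock *sk)
  {
          struct tcp_sock *tp = (struct tcp_sock *)sk;

          /* peer did not negotiate ECN: switch to the fallback CC */
          if (!(tp->ecn_flags & TCP_ECN_OK))
                  bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
                                 fallback_cc, sizeof(fallback_cc));
  }

  char LICENSE[] SEC("license") = "GPL";

Sketch for 7), reading the netns cookie from a sockops program:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  SEC("sockops")
  int track_netns(struct bpf_sock_ops *ctx)
  {
          __u64 cookie = bpf_get_netns_cookie(ctx);

          bpf_printk("netns cookie %llu", cookie);
          return 1;
  }

  char LICENSE[] SEC("license") = "GPL";

Sketch for 8), a __weak typed ksym: the object still loads on kernels that
lack the symbol, where its address resolves to NULL:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  extern const struct rq runqueues __ksym __weak; /* typed, per-cpu */

  SEC("raw_tp/sys_enter")
  int read_rq(const void *ctx)
  {
          struct rq *rq;

          if (!&runqueues) /* symbol absent on this kernel */
                  return 0;
          rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, 0);
          if (rq)
                  bpf_printk("cpu0 rq->cpu=%d", rq->cpu);
          return 0;
  }

  char LICENSE[] SEC("license") = "GPL";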

* https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (116 commits)
  MAINTAINERS: Remove self from powerpc BPF JIT
  selftests/bpf: Fix potential unreleased lock
  samples: bpf: Fix uninitialized variable in xdp_redirect_cpu
  selftests/bpf: Reduce more flakyness in sockmap_listen
  bpf: Fix bpf-next builds without CONFIG_BPF_EVENTS
  bpf: selftests: Add dctcp fallback test
  bpf: selftests: Add connect_to_fd_opts to network_helpers
  bpf: selftests: Add sk_state to bpf_tcp_helpers.h
  bpf: tcp: Allow bpf-tcp-cc to call bpf_(get|set)sockopt
  selftests: xsk: Preface options with opt
  selftests: xsk: Make enums lower case
  selftests: xsk: Generate packets from specification
  selftests: xsk: Generate packet directly in umem
  selftests: xsk: Simplify cleanup of ifobjects
  selftests: xsk: Decrease sending speed
  selftests: xsk: Validate tx stats on tx thread
  selftests: xsk: Simplify packet validation in xsk tests
  selftests: xsk: Rename worker_* functions that are not thread entry points
  selftests: xsk: Disassociate umem size with packets sent
  selftests: xsk: Remove end-of-test packet
  ...
====================

Link: https://lore.kernel.org/r/20210830225618.11634-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
126 files changed:
Documentation/bpf/index.rst
Documentation/bpf/libbpf/index.rst [moved from Documentation/bpf/libbpf/libbpf.rst with 75% similarity]
Documentation/bpf/libbpf/libbpf_api.rst [deleted file]
Documentation/bpf/libbpf/libbpf_naming_convention.rst
Documentation/networking/filter.rst
MAINTAINERS
drivers/media/rc/bpf-lirc.c
drivers/net/ppp/ppp_generic.c
drivers/net/team/team_mode_loadbalance.c
include/linux/bpf-cgroup.h
include/linux/bpf.h
include/linux/bpf_types.h
include/linux/bpfptr.h
include/linux/btf_ids.h
include/linux/filter.h
include/linux/perf_event.h
include/linux/trace_events.h
include/net/af_unix.h
include/uapi/linux/bpf.h
kernel/bpf/bpf_iter.c
kernel/bpf/bpf_struct_ops.c
kernel/bpf/bpf_task_storage.c
kernel/bpf/btf.c
kernel/bpf/cgroup.c
kernel/bpf/core.c
kernel/bpf/helpers.c
kernel/bpf/stackmap.c
kernel/bpf/syscall.c
kernel/bpf/task_iter.c
kernel/bpf/trampoline.c
kernel/bpf/verifier.c
kernel/events/core.c
kernel/trace/bpf_trace.c
lib/test_bpf.c
net/bpf/test_run.c
net/core/filter.c
net/core/ptp_classifier.c
net/core/sock_map.c
net/ipv4/af_inet.c
net/ipv4/bpf_tcp_ca.c
net/ipv4/udp.c
net/ipv6/af_inet6.c
net/ipv6/udp.c
net/netfilter/xt_bpf.c
net/sched/act_bpf.c
net/sched/cls_bpf.c
net/unix/af_unix.c
net/unix/unix_bpf.c
samples/bpf/Makefile
samples/bpf/Makefile.target
samples/bpf/cookie_uid_helper_example.c
samples/bpf/offwaketime_kern.c
samples/bpf/tracex4_user.c
samples/bpf/xdp_monitor.bpf.c [new file with mode: 0644]
samples/bpf/xdp_monitor_kern.c [deleted file]
samples/bpf/xdp_monitor_user.c
samples/bpf/xdp_redirect.bpf.c [new file with mode: 0644]
samples/bpf/xdp_redirect_cpu.bpf.c [moved from samples/bpf/xdp_redirect_cpu_kern.c with 52% similarity]
samples/bpf/xdp_redirect_cpu_user.c
samples/bpf/xdp_redirect_kern.c [deleted file]
samples/bpf/xdp_redirect_map.bpf.c [moved from samples/bpf/xdp_redirect_map_kern.c with 57% similarity]
samples/bpf/xdp_redirect_map_multi.bpf.c [moved from samples/bpf/xdp_redirect_map_multi_kern.c with 64% similarity]
samples/bpf/xdp_redirect_map_multi_user.c
samples/bpf/xdp_redirect_map_user.c
samples/bpf/xdp_redirect_user.c
samples/bpf/xdp_sample.bpf.c [new file with mode: 0644]
samples/bpf/xdp_sample.bpf.h [new file with mode: 0644]
samples/bpf/xdp_sample_shared.h [new file with mode: 0644]
samples/bpf/xdp_sample_user.c [new file with mode: 0644]
samples/bpf/xdp_sample_user.h [new file with mode: 0644]
tools/include/uapi/linux/bpf.h
tools/include/uapi/linux/ethtool.h
tools/lib/bpf/Makefile
tools/lib/bpf/bpf.c
tools/lib/bpf/bpf.h
tools/lib/bpf/libbpf.c
tools/lib/bpf/libbpf.h
tools/lib/bpf/libbpf.map
tools/lib/bpf/libbpf_internal.h
tools/testing/selftests/bpf/Makefile
tools/testing/selftests/bpf/bpf_tcp_helpers.h
tools/testing/selftests/bpf/network_helpers.c
tools/testing/selftests/bpf/network_helpers.h
tools/testing/selftests/bpf/prog_tests/attach_probe.c
tools/testing/selftests/bpf/prog_tests/bpf_cookie.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/bpf_iter.c
tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
tools/testing/selftests/bpf/prog_tests/btf_module.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/kfunc_call.c
tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
tools/testing/selftests/bpf/prog_tests/netns_cookie.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/perf_link.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/send_signal.c
tools/testing/selftests/bpf/prog_tests/snprintf.c
tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
tools/testing/selftests/bpf/prog_tests/sockopt_qos_to_cc.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/task_pt_regs.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/timer_mim.c
tools/testing/selftests/bpf/prog_tests/xdp_bonding.c
tools/testing/selftests/bpf/progs/bpf_dctcp.c
tools/testing/selftests/bpf/progs/bpf_dctcp_release.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/bpf_iter.h
tools/testing/selftests/bpf/progs/bpf_iter_unix.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/bpf_tracing_net.h
tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c
tools/testing/selftests/bpf/progs/netns_cookie_prog.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/sockopt_sk.c
tools/testing/selftests/bpf/progs/test_bpf_cookie.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/test_core_autosize.c
tools/testing/selftests/bpf/progs/test_ksyms_weak.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/test_perf_link.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/test_snprintf.c
tools/testing/selftests/bpf/progs/test_task_pt_regs.c [new file with mode: 0644]
tools/testing/selftests/bpf/test_bpftool.sh
tools/testing/selftests/bpf/test_bpftool_build.sh
tools/testing/selftests/bpf/test_doc_build.sh
tools/testing/selftests/bpf/test_maps.c
tools/testing/selftests/bpf/test_progs.c
tools/testing/selftests/bpf/test_xsk.sh
tools/testing/selftests/bpf/trace_helpers.c
tools/testing/selftests/bpf/trace_helpers.h
tools/testing/selftests/bpf/xdpxceiver.c
tools/testing/selftests/bpf/xdpxceiver.h
tools/testing/selftests/bpf/xsk_prereqs.sh

diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst
index baea6c2..1ceb5d7 100644
@@ -15,15 +15,7 @@ that goes into great technical depth about the BPF Architecture.
 libbpf
 ======
 
-Libbpf is a userspace library for loading and interacting with bpf programs.
-
-.. toctree::
-   :maxdepth: 1
-
-   libbpf/libbpf
-   libbpf/libbpf_api
-   libbpf/libbpf_build
-   libbpf/libbpf_naming_convention
+Documentation/bpf/libbpf/libbpf.rst is a userspace library for loading and interacting with bpf programs.
 
 BPF Type Format (BTF)
 =====================
diff --git a/Documentation/bpf/libbpf/libbpf.rst b/Documentation/bpf/libbpf/index.rst
similarity index 75%
rename from Documentation/bpf/libbpf/libbpf.rst
rename to Documentation/bpf/libbpf/index.rst
index 1b1e61d..4f8adfc 100644
@@ -3,6 +3,14 @@
 libbpf
 ======
 
+For API documentation see the `versioned API documentation site <https://libbpf.readthedocs.io/en/latest/api.html>`_.
+
+.. toctree::
+   :maxdepth: 1
+
+   libbpf_naming_convention
+   libbpf_build
+
 This is documentation for libbpf, a userspace library for loading and
 interacting with bpf programs.
 
diff --git a/Documentation/bpf/libbpf/libbpf_api.rst b/Documentation/bpf/libbpf/libbpf_api.rst
deleted file mode 100644
index f07eecd..0000000
--- a/Documentation/bpf/libbpf/libbpf_api.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
-
-API
-===
-
-This documentation is autogenerated from header files in libbpf, tools/lib/bpf
-
-.. kernel-doc:: tools/lib/bpf/libbpf.h
-   :internal:
-
-.. kernel-doc:: tools/lib/bpf/bpf.h
-   :internal:
-
-.. kernel-doc:: tools/lib/bpf/btf.h
-   :internal:
-
-.. kernel-doc:: tools/lib/bpf/xsk.h
-   :internal:
-
-.. kernel-doc:: tools/lib/bpf/bpf_tracing.h
-   :internal:
-
-.. kernel-doc:: tools/lib/bpf/bpf_core_read.h
-   :internal:
-
-.. kernel-doc:: tools/lib/bpf/bpf_endian.h
-   :internal:
\ No newline at end of file
diff --git a/Documentation/bpf/libbpf/libbpf_naming_convention.rst b/Documentation/bpf/libbpf/libbpf_naming_convention.rst
index 6bf9c5a..9c68d50 100644
@@ -69,7 +69,7 @@ functions. These can be mixed and matched. Note that these functions
 are not reentrant for performance reasons.
 
 ABI
-==========
+---
 
 libbpf can be both linked statically or used as DSO. To avoid possible
 conflicts with other libraries an application is linked with, all
diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst
index 5f13905..ce2b8e8 100644
@@ -638,8 +638,8 @@ extension, PTP dissector/classifier, and much more. They are all internally
 converted by the kernel into the new instruction set representation and run
 in the eBPF interpreter. For in-kernel handlers, this all works transparently
 by using bpf_prog_create() for setting up the filter, resp.
-bpf_prog_destroy() for destroying it. The macro
-BPF_PROG_RUN(filter, ctx) transparently invokes eBPF interpreter or JITed
+bpf_prog_destroy() for destroying it. The function
+bpf_prog_run(filter, ctx) transparently invokes eBPF interpreter or JITed
 code to run the filter. 'filter' is a pointer to struct bpf_prog that we
 got from bpf_prog_create(), and 'ctx' the given context (e.g.
 skb pointer). All constraints and restrictions from bpf_check_classic() apply
diff --git a/MAINTAINERS b/MAINTAINERS
index 6abfd3e..2f12abc 100644
@@ -3409,7 +3409,6 @@ F:        drivers/net/ethernet/netronome/nfp/bpf/
 
 BPF JIT for POWERPC (32-BIT AND 64-BIT)
 M:     Naveen N. Rao <naveen.n.rao@linux.ibm.com>
-M:     Sandipan Das <sandipan@linux.ibm.com>
 L:     netdev@vger.kernel.org
 L:     bpf@vger.kernel.org
 S:     Maintained
diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
index afae0af..3eff08d 100644
@@ -160,7 +160,7 @@ static int lirc_bpf_attach(struct rc_dev *rcdev, struct bpf_prog *prog)
                goto unlock;
        }
 
-       ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+       ret = bpf_prog_array_copy(old_array, NULL, prog, 0, &new_array);
        if (ret < 0)
                goto unlock;
 
@@ -193,7 +193,7 @@ static int lirc_bpf_detach(struct rc_dev *rcdev, struct bpf_prog *prog)
        }
 
        old_array = lirc_rcu_dereference(raw->progs);
-       ret = bpf_prog_array_copy(old_array, prog, NULL, &new_array);
+       ret = bpf_prog_array_copy(old_array, prog, NULL, 0, &new_array);
        /*
         * Do not use bpf_prog_array_delete_safe() as we would end up
         * with a dummy entry in the array, and the we would free the
@@ -217,7 +217,7 @@ void lirc_bpf_run(struct rc_dev *rcdev, u32 sample)
        raw->bpf_sample = sample;
 
        if (raw->progs)
-               BPF_PROG_RUN_ARRAY(raw->progs, &raw->bpf_sample, BPF_PROG_RUN);
+               BPF_PROG_RUN_ARRAY(raw->progs, &raw->bpf_sample, bpf_prog_run);
 }
 
 /*
diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index e9e8157..fb52cd1 100644
@@ -1744,7 +1744,7 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb)
                   a four-byte PPP header on each packet */
                *(u8 *)skb_push(skb, 2) = 1;
                if (ppp->pass_filter &&
-                   BPF_PROG_RUN(ppp->pass_filter, skb) == 0) {
+                   bpf_prog_run(ppp->pass_filter, skb) == 0) {
                        if (ppp->debug & 1)
                                netdev_printk(KERN_DEBUG, ppp->dev,
                                              "PPP: outbound frame "
@@ -1754,7 +1754,7 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb)
                }
                /* if this packet passes the active filter, record the time */
                if (!(ppp->active_filter &&
-                     BPF_PROG_RUN(ppp->active_filter, skb) == 0))
+                     bpf_prog_run(ppp->active_filter, skb) == 0))
                        ppp->last_xmit = jiffies;
                skb_pull(skb, 2);
 #else
@@ -2468,7 +2468,7 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb)
 
                        *(u8 *)skb_push(skb, 2) = 0;
                        if (ppp->pass_filter &&
-                           BPF_PROG_RUN(ppp->pass_filter, skb) == 0) {
+                           bpf_prog_run(ppp->pass_filter, skb) == 0) {
                                if (ppp->debug & 1)
                                        netdev_printk(KERN_DEBUG, ppp->dev,
                                                      "PPP: inbound frame "
@@ -2477,7 +2477,7 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb)
                                return;
                        }
                        if (!(ppp->active_filter &&
-                             BPF_PROG_RUN(ppp->active_filter, skb) == 0))
+                             bpf_prog_run(ppp->active_filter, skb) == 0))
                                ppp->last_recv = jiffies;
                        __skb_pull(skb, 2);
                } else
diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c
index 32aef8a..b095a4b 100644
@@ -197,7 +197,7 @@ static unsigned int lb_get_skb_hash(struct lb_priv *lb_priv,
        fp = rcu_dereference_bh(lb_priv->fp);
        if (unlikely(!fp))
                return 0;
-       lhash = BPF_PROG_RUN(fp, skb);
+       lhash = bpf_prog_run(fp, skb);
        c = (char *) &lhash;
        return c[0] ^ c[1] ^ c[2] ^ c[3];
 }
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index a74cd1c..2746fd8 100644
@@ -23,9 +23,73 @@ struct ctl_table_header;
 struct task_struct;
 
 #ifdef CONFIG_CGROUP_BPF
+enum cgroup_bpf_attach_type {
+       CGROUP_BPF_ATTACH_TYPE_INVALID = -1,
+       CGROUP_INET_INGRESS = 0,
+       CGROUP_INET_EGRESS,
+       CGROUP_INET_SOCK_CREATE,
+       CGROUP_SOCK_OPS,
+       CGROUP_DEVICE,
+       CGROUP_INET4_BIND,
+       CGROUP_INET6_BIND,
+       CGROUP_INET4_CONNECT,
+       CGROUP_INET6_CONNECT,
+       CGROUP_INET4_POST_BIND,
+       CGROUP_INET6_POST_BIND,
+       CGROUP_UDP4_SENDMSG,
+       CGROUP_UDP6_SENDMSG,
+       CGROUP_SYSCTL,
+       CGROUP_UDP4_RECVMSG,
+       CGROUP_UDP6_RECVMSG,
+       CGROUP_GETSOCKOPT,
+       CGROUP_SETSOCKOPT,
+       CGROUP_INET4_GETPEERNAME,
+       CGROUP_INET6_GETPEERNAME,
+       CGROUP_INET4_GETSOCKNAME,
+       CGROUP_INET6_GETSOCKNAME,
+       CGROUP_INET_SOCK_RELEASE,
+       MAX_CGROUP_BPF_ATTACH_TYPE
+};
+
+#define CGROUP_ATYPE(type) \
+       case BPF_##type: return type
+
+static inline enum cgroup_bpf_attach_type
+to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type)
+{
+       switch (attach_type) {
+       CGROUP_ATYPE(CGROUP_INET_INGRESS);
+       CGROUP_ATYPE(CGROUP_INET_EGRESS);
+       CGROUP_ATYPE(CGROUP_INET_SOCK_CREATE);
+       CGROUP_ATYPE(CGROUP_SOCK_OPS);
+       CGROUP_ATYPE(CGROUP_DEVICE);
+       CGROUP_ATYPE(CGROUP_INET4_BIND);
+       CGROUP_ATYPE(CGROUP_INET6_BIND);
+       CGROUP_ATYPE(CGROUP_INET4_CONNECT);
+       CGROUP_ATYPE(CGROUP_INET6_CONNECT);
+       CGROUP_ATYPE(CGROUP_INET4_POST_BIND);
+       CGROUP_ATYPE(CGROUP_INET6_POST_BIND);
+       CGROUP_ATYPE(CGROUP_UDP4_SENDMSG);
+       CGROUP_ATYPE(CGROUP_UDP6_SENDMSG);
+       CGROUP_ATYPE(CGROUP_SYSCTL);
+       CGROUP_ATYPE(CGROUP_UDP4_RECVMSG);
+       CGROUP_ATYPE(CGROUP_UDP6_RECVMSG);
+       CGROUP_ATYPE(CGROUP_GETSOCKOPT);
+       CGROUP_ATYPE(CGROUP_SETSOCKOPT);
+       CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME);
+       CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME);
+       CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME);
+       CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME);
+       CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE);
+       default:
+               return CGROUP_BPF_ATTACH_TYPE_INVALID;
+       }
+}
+
+#undef CGROUP_ATYPE
 
-extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE];
-#define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type])
+extern struct static_key_false cgroup_bpf_enabled_key[MAX_CGROUP_BPF_ATTACH_TYPE];
+#define cgroup_bpf_enabled(atype) static_branch_unlikely(&cgroup_bpf_enabled_key[atype])
 
 #define for_each_cgroup_storage_type(stype) \
        for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
@@ -67,15 +131,15 @@ struct bpf_prog_array;
 
 struct cgroup_bpf {
        /* array of effective progs in this cgroup */
-       struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE];
+       struct bpf_prog_array __rcu *effective[MAX_CGROUP_BPF_ATTACH_TYPE];
 
        /* attached progs to this cgroup and attach flags
         * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will
         * have either zero or one element
         * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
         */
-       struct list_head progs[MAX_BPF_ATTACH_TYPE];
-       u32 flags[MAX_BPF_ATTACH_TYPE];
+       struct list_head progs[MAX_CGROUP_BPF_ATTACH_TYPE];
+       u32 flags[MAX_CGROUP_BPF_ATTACH_TYPE];
 
        /* list of cgroup shared storages */
        struct list_head storages;
@@ -115,28 +179,28 @@ int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
                                struct sk_buff *skb,
-                               enum bpf_attach_type type);
+                               enum cgroup_bpf_attach_type atype);
 
 int __cgroup_bpf_run_filter_sk(struct sock *sk,
-                              enum bpf_attach_type type);
+                              enum cgroup_bpf_attach_type atype);
 
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
                                      struct sockaddr *uaddr,
-                                     enum bpf_attach_type type,
+                                     enum cgroup_bpf_attach_type atype,
                                      void *t_ctx,
                                      u32 *flags);
 
 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
                                     struct bpf_sock_ops_kern *sock_ops,
-                                    enum bpf_attach_type type);
+                                    enum cgroup_bpf_attach_type atype);
 
 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
-                                     short access, enum bpf_attach_type type);
+                                     short access, enum cgroup_bpf_attach_type atype);
 
 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
                                   struct ctl_table *table, int write,
                                   char **buf, size_t *pcount, loff_t *ppos,
-                                  enum bpf_attach_type type);
+                                  enum cgroup_bpf_attach_type atype);
 
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level,
                                       int *optname, char __user *optval,
@@ -179,9 +243,9 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)                            \
 ({                                                                           \
        int __ret = 0;                                                        \
-       if (cgroup_bpf_enabled(BPF_CGROUP_INET_INGRESS))                      \
+       if (cgroup_bpf_enabled(CGROUP_INET_INGRESS))                  \
                __ret = __cgroup_bpf_run_filter_skb(sk, skb,                  \
-                                                   BPF_CGROUP_INET_INGRESS); \
+                                                   CGROUP_INET_INGRESS); \
                                                                              \
        __ret;                                                                \
 })
@@ -189,54 +253,54 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb)                              \
 ({                                                                            \
        int __ret = 0;                                                         \
-       if (cgroup_bpf_enabled(BPF_CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \
+       if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \
                typeof(sk) __sk = sk_to_full_sk(sk);                           \
                if (sk_fullsock(__sk))                                         \
                        __ret = __cgroup_bpf_run_filter_skb(__sk, skb,         \
-                                                     BPF_CGROUP_INET_EGRESS); \
+                                                     CGROUP_INET_EGRESS); \
        }                                                                      \
        __ret;                                                                 \
 })
 
-#define BPF_CGROUP_RUN_SK_PROG(sk, type)                                      \
+#define BPF_CGROUP_RUN_SK_PROG(sk, atype)                                     \
 ({                                                                            \
        int __ret = 0;                                                         \
-       if (cgroup_bpf_enabled(type)) {                                        \
-               __ret = __cgroup_bpf_run_filter_sk(sk, type);                  \
+       if (cgroup_bpf_enabled(atype)) {                                               \
+               __ret = __cgroup_bpf_run_filter_sk(sk, atype);                 \
        }                                                                      \
        __ret;                                                                 \
 })
 
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk)                                     \
-       BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE)
+       BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_CREATE)
 
 #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk)                             \
-       BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_RELEASE)
+       BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_RELEASE)
 
 #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk)                                       \
-       BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND)
+       BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET4_POST_BIND)
 
 #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk)                                       \
-       BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND)
+       BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND)
 
-#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type)                                       \
+#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype)                                      \
 ({                                                                            \
        u32 __unused_flags;                                                    \
        int __ret = 0;                                                         \
-       if (cgroup_bpf_enabled(type))                                          \
-               __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
+       if (cgroup_bpf_enabled(atype))                                         \
+               __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
                                                          NULL,                \
                                                          &__unused_flags);    \
        __ret;                                                                 \
 })
 
-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx)                   \
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx)                  \
 ({                                                                            \
        u32 __unused_flags;                                                    \
        int __ret = 0;                                                         \
-       if (cgroup_bpf_enabled(type))   {                                      \
+       if (cgroup_bpf_enabled(atype))  {                                      \
                lock_sock(sk);                                                 \
-               __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
+               __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
                                                          t_ctx,               \
                                                          &__unused_flags);    \
                release_sock(sk);                                              \
@@ -249,13 +313,13 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
  * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check
  * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE).
  */
-#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, bind_flags)               \
+#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, bind_flags)              \
 ({                                                                            \
        u32 __flags = 0;                                                       \
        int __ret = 0;                                                         \
-       if (cgroup_bpf_enabled(type))   {                                      \
+       if (cgroup_bpf_enabled(atype))  {                                      \
                lock_sock(sk);                                                 \
-               __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
+               __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
                                                          NULL, &__flags);     \
                release_sock(sk);                                              \
                if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE)            \
@@ -265,33 +329,33 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 })
 
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk)                                    \
-       ((cgroup_bpf_enabled(BPF_CGROUP_INET4_CONNECT) ||                      \
-         cgroup_bpf_enabled(BPF_CGROUP_INET6_CONNECT)) &&                     \
+       ((cgroup_bpf_enabled(CGROUP_INET4_CONNECT) ||                  \
+         cgroup_bpf_enabled(CGROUP_INET6_CONNECT)) &&                 \
         (sk)->sk_prot->pre_connect)
 
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr)                          \
-       BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT)
+       BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET4_CONNECT)
 
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr)                          \
-       BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
+       BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET6_CONNECT)
 
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr)                     \
-       BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL)
+       BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET4_CONNECT, NULL)
 
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr)                     \
-       BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL)
+       BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET6_CONNECT, NULL)
 
 #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx)                       \
-       BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx)
+       BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_SENDMSG, t_ctx)
 
 #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx)                       \
-       BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx)
+       BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_SENDMSG, t_ctx)
 
 #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr)                       \
-       BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_RECVMSG, NULL)
+       BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_RECVMSG, NULL)
 
 #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr)                       \
-       BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL)
+       BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_RECVMSG, NULL)
 
 /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a
  * fullsock and its parent fullsock cannot be traced by
@@ -311,33 +375,33 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk)                  \
 ({                                                                     \
        int __ret = 0;                                                  \
-       if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS))                    \
+       if (cgroup_bpf_enabled(CGROUP_SOCK_OPS))                        \
                __ret = __cgroup_bpf_run_filter_sock_ops(sk,            \
                                                         sock_ops,      \
-                                                        BPF_CGROUP_SOCK_OPS); \
+                                                        CGROUP_SOCK_OPS); \
        __ret;                                                          \
 })
 
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops)                                \
 ({                                                                            \
        int __ret = 0;                                                         \
-       if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS) && (sock_ops)->sk) {       \
+       if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) && (sock_ops)->sk) {       \
                typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk);               \
                if (__sk && sk_fullsock(__sk))                                 \
                        __ret = __cgroup_bpf_run_filter_sock_ops(__sk,         \
                                                                 sock_ops,     \
-                                                        BPF_CGROUP_SOCK_OPS); \
+                                                        CGROUP_SOCK_OPS); \
        }                                                                      \
        __ret;                                                                 \
 })
 
-#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access)        \
+#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access)       \
 ({                                                                           \
        int __ret = 0;                                                        \
-       if (cgroup_bpf_enabled(BPF_CGROUP_DEVICE))                            \
-               __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \
+       if (cgroup_bpf_enabled(CGROUP_DEVICE))                        \
+               __ret = __cgroup_bpf_check_dev_permission(atype, major, minor, \
                                                          access,             \
-                                                         BPF_CGROUP_DEVICE); \
+                                                         CGROUP_DEVICE); \
                                                                              \
        __ret;                                                                \
 })
@@ -346,10 +410,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos)  \
 ({                                                                            \
        int __ret = 0;                                                         \
-       if (cgroup_bpf_enabled(BPF_CGROUP_SYSCTL))                             \
+       if (cgroup_bpf_enabled(CGROUP_SYSCTL))                         \
                __ret = __cgroup_bpf_run_filter_sysctl(head, table, write,     \
                                                       buf, count, pos,        \
-                                                      BPF_CGROUP_SYSCTL);     \
+                                                      CGROUP_SYSCTL);     \
        __ret;                                                                 \
 })
 
@@ -357,7 +421,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
                                       kernel_optval)                          \
 ({                                                                            \
        int __ret = 0;                                                         \
-       if (cgroup_bpf_enabled(BPF_CGROUP_SETSOCKOPT))                         \
+       if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT))                             \
                __ret = __cgroup_bpf_run_filter_setsockopt(sock, level,        \
                                                           optname, optval,    \
                                                           optlen,             \
@@ -368,7 +432,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen)                              \
 ({                                                                            \
        int __ret = 0;                                                         \
-       if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT))                         \
+       if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT))                             \
                get_user(__ret, optlen);                                       \
        __ret;                                                                 \
 })
@@ -377,7 +441,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
                                       max_optlen, retval)                     \
 ({                                                                            \
        int __ret = retval;                                                    \
-       if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT))                         \
+       if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT))                             \
                if (!(sock)->sk_prot->bpf_bypass_getsockopt ||                 \
                    !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \
                                        tcp_bpf_bypass_getsockopt,             \
@@ -392,7 +456,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
                                            optlen, retval)                    \
 ({                                                                            \
        int __ret = retval;                                                    \
-       if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT))                         \
+       if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT))                             \
                __ret = __cgroup_bpf_run_filter_getsockopt_kern(               \
                        sock, level, optname, optval, optlen, retval);         \
        __ret;                                                                 \
@@ -451,14 +515,14 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
        return 0;
 }
 
-#define cgroup_bpf_enabled(type) (0)
-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; })
+#define cgroup_bpf_enabled(atype) (0)
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; })
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, flags) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, flags) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; })
@@ -470,7 +534,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; })
 #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c8cc090..f4c16f1 100644
@@ -1103,7 +1103,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 /* an array of programs to be executed under rcu_lock.
  *
  * Typical usage:
- * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, BPF_PROG_RUN);
+ * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, bpf_prog_run);
  *
  * the structure returned by bpf_prog_array_alloc() should be populated
  * with program pointers and the last pointer must be NULL.
@@ -1114,7 +1114,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
  */
 struct bpf_prog_array_item {
        struct bpf_prog *prog;
-       struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
+       union {
+               struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
+               u64 bpf_cookie;
+       };
 };
 
 struct bpf_prog_array {
@@ -1140,73 +1143,133 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array,
 int bpf_prog_array_copy(struct bpf_prog_array *old_array,
                        struct bpf_prog *exclude_prog,
                        struct bpf_prog *include_prog,
+                       u64 bpf_cookie,
                        struct bpf_prog_array **new_array);
 
 struct bpf_run_ctx {};
 
 struct bpf_cg_run_ctx {
        struct bpf_run_ctx run_ctx;
-       struct bpf_prog_array_item *prog_item;
+       const struct bpf_prog_array_item *prog_item;
+};
+
+struct bpf_trace_run_ctx {
+       struct bpf_run_ctx run_ctx;
+       u64 bpf_cookie;
 };
 
+static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx)
+{
+       struct bpf_run_ctx *old_ctx = NULL;
+
+#ifdef CONFIG_BPF_SYSCALL
+       old_ctx = current->bpf_ctx;
+       current->bpf_ctx = new_ctx;
+#endif
+       return old_ctx;
+}
+
+static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx)
+{
+#ifdef CONFIG_BPF_SYSCALL
+       current->bpf_ctx = old_ctx;
+#endif
+}
+
 /* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */
 #define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE                   (1 << 0)
 /* BPF program asks to set CN on the packet. */
 #define BPF_RET_SET_CN                                         (1 << 0)
 
-#define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags)          \
-       ({                                                              \
-               struct bpf_prog_array_item *_item;                      \
-               struct bpf_prog *_prog;                                 \
-               struct bpf_prog_array *_array;                          \
-               struct bpf_run_ctx *old_run_ctx;                        \
-               struct bpf_cg_run_ctx run_ctx;                          \
-               u32 _ret = 1;                                           \
-               u32 func_ret;                                           \
-               migrate_disable();                                      \
-               rcu_read_lock();                                        \
-               _array = rcu_dereference(array);                        \
-               _item = &_array->items[0];                              \
-               old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);        \
-               while ((_prog = READ_ONCE(_item->prog))) {              \
-                       run_ctx.prog_item = _item;                      \
-                       func_ret = func(_prog, ctx);                    \
-                       _ret &= (func_ret & 1);                         \
-                       *(ret_flags) |= (func_ret >> 1);                \
-                       _item++;                                        \
-               }                                                       \
-               bpf_reset_run_ctx(old_run_ctx);                         \
-               rcu_read_unlock();                                      \
-               migrate_enable();                                       \
-               _ret;                                                   \
-        })
-
-#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage) \
-       ({                                              \
-               struct bpf_prog_array_item *_item;      \
-               struct bpf_prog *_prog;                 \
-               struct bpf_prog_array *_array;          \
-               struct bpf_run_ctx *old_run_ctx;        \
-               struct bpf_cg_run_ctx run_ctx;          \
-               u32 _ret = 1;                           \
-               migrate_disable();                      \
-               rcu_read_lock();                        \
-               _array = rcu_dereference(array);        \
-               if (unlikely(check_non_null && !_array))\
-                       goto _out;                      \
-               _item = &_array->items[0];              \
-               old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);\
-               while ((_prog = READ_ONCE(_item->prog))) {      \
-                       run_ctx.prog_item = _item;      \
-                       _ret &= func(_prog, ctx);       \
-                       _item++;                        \
-               }                                       \
-               bpf_reset_run_ctx(old_run_ctx);         \
-_out:                                                  \
-               rcu_read_unlock();                      \
-               migrate_enable();                       \
-               _ret;                                   \
-        })
+typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx);
+
+static __always_inline u32
+BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu,
+                           const void *ctx, bpf_prog_run_fn run_prog,
+                           u32 *ret_flags)
+{
+       const struct bpf_prog_array_item *item;
+       const struct bpf_prog *prog;
+       const struct bpf_prog_array *array;
+       struct bpf_run_ctx *old_run_ctx;
+       struct bpf_cg_run_ctx run_ctx;
+       u32 ret = 1;
+       u32 func_ret;
+
+       migrate_disable();
+       rcu_read_lock();
+       array = rcu_dereference(array_rcu);
+       item = &array->items[0];
+       old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+       while ((prog = READ_ONCE(item->prog))) {
+               run_ctx.prog_item = item;
+               func_ret = run_prog(prog, ctx);
+               ret &= (func_ret & 1);
+               *(ret_flags) |= (func_ret >> 1);
+               item++;
+       }
+       bpf_reset_run_ctx(old_run_ctx);
+       rcu_read_unlock();
+       migrate_enable();
+       return ret;
+}
+
+static __always_inline u32
+BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu,
+                     const void *ctx, bpf_prog_run_fn run_prog)
+{
+       const struct bpf_prog_array_item *item;
+       const struct bpf_prog *prog;
+       const struct bpf_prog_array *array;
+       struct bpf_run_ctx *old_run_ctx;
+       struct bpf_cg_run_ctx run_ctx;
+       u32 ret = 1;
+
+       migrate_disable();
+       rcu_read_lock();
+       array = rcu_dereference(array_rcu);
+       item = &array->items[0];
+       old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+       while ((prog = READ_ONCE(item->prog))) {
+               run_ctx.prog_item = item;
+               ret &= run_prog(prog, ctx);
+               item++;
+       }
+       bpf_reset_run_ctx(old_run_ctx);
+       rcu_read_unlock();
+       migrate_enable();
+       return ret;
+}
+
+static __always_inline u32
+BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu,
+                  const void *ctx, bpf_prog_run_fn run_prog)
+{
+       const struct bpf_prog_array_item *item;
+       const struct bpf_prog *prog;
+       const struct bpf_prog_array *array;
+       struct bpf_run_ctx *old_run_ctx;
+       struct bpf_trace_run_ctx run_ctx;
+       u32 ret = 1;
+
+       migrate_disable();
+       rcu_read_lock();
+       array = rcu_dereference(array_rcu);
+       if (unlikely(!array))
+               goto out;
+       old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+       item = &array->items[0];
+       while ((prog = READ_ONCE(item->prog))) {
+               run_ctx.bpf_cookie = item->bpf_cookie;
+               ret &= run_prog(prog, ctx);
+               item++;
+       }
+       bpf_reset_run_ctx(old_run_ctx);
+out:
+       rcu_read_unlock();
+       migrate_enable();
+       return ret;
+}
 
 /* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs
  * so BPF programs can request cwr for TCP packets.
@@ -1235,7 +1298,7 @@ _out:                                                     \
                u32 _flags = 0;                         \
                bool _cn;                               \
                u32 _ret;                               \
-               _ret = BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, &_flags); \
+               _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, &_flags); \
                _cn = _flags & BPF_RET_SET_CN;          \
                if (_ret)                               \
                        _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);  \
@@ -1244,12 +1307,6 @@ _out:                                                    \
                _ret;                                   \
        })
 
-#define BPF_PROG_RUN_ARRAY(array, ctx, func)           \
-       __BPF_PROG_RUN_ARRAY(array, ctx, func, false, true)
-
-#define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func)     \
-       __BPF_PROG_RUN_ARRAY(array, ctx, func, true, false)
-
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 extern struct mutex bpf_stats_enabled_mutex;
@@ -1284,20 +1341,6 @@ static inline void bpf_enable_instrumentation(void)
        migrate_enable();
 }
 
-static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx)
-{
-       struct bpf_run_ctx *old_ctx;
-
-       old_ctx = current->bpf_ctx;
-       current->bpf_ctx = new_ctx;
-       return old_ctx;
-}
-
-static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx)
-{
-       current->bpf_ctx = old_ctx;
-}
-
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;
 extern const struct file_operations bpf_iter_fops;
@@ -2059,9 +2102,6 @@ extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
 extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
 extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
 
-const struct bpf_func_proto *bpf_tracing_func_proto(
-       enum bpf_func_id func_id, const struct bpf_prog *prog);
-
 const struct bpf_func_proto *tracing_prog_func_proto(
   enum bpf_func_id func_id, const struct bpf_prog *prog);
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index ae3ac3a..9c81724 100644
@@ -136,3 +136,6 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
 BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns)
 BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp)
 #endif
+#ifdef CONFIG_PERF_EVENTS
+BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf)
+#endif
diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h
index 5cdeab4..546e27f 100644
@@ -62,9 +62,17 @@ static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset,
        return copy_to_sockptr_offset((sockptr_t) dst, offset, src, size);
 }
 
-static inline void *memdup_bpfptr(bpfptr_t src, size_t len)
+static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len)
 {
-       return memdup_sockptr((sockptr_t) src, len);
+       void *p = kvmalloc(len, GFP_USER | __GFP_NOWARN);
+
+       if (!p)
+               return ERR_PTR(-ENOMEM);
+       if (copy_from_bpfptr(p, src, len)) {
+               kvfree(p);
+               return ERR_PTR(-EFAULT);
+       }
+       return p;
 }
 
 static inline long strncpy_from_bpfptr(char *dst, bpfptr_t src, size_t count)
diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index 57890b3..47d9abf 100644
@@ -82,6 +82,9 @@ __BTF_ID_LIST(name, globl)
 #define BTF_ID_LIST_SINGLE(name, prefix, typename)     \
        BTF_ID_LIST(name) \
        BTF_ID(prefix, typename)
+#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) \
+       BTF_ID_LIST_GLOBAL(name) \
+       BTF_ID(prefix, typename)
 
 /*
  * The BTF_ID_UNUSED macro defines 4 zero bytes.
@@ -148,6 +151,7 @@ extern struct btf_id_set name;
 #define BTF_ID_UNUSED
 #define BTF_ID_LIST_GLOBAL(name) u32 name[1];
 #define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1];
+#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 name[1];
 #define BTF_SET_START(name) static struct btf_id_set name = { 0 };
 #define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 };
 #define BTF_SET_END(name)
@@ -172,7 +176,8 @@ extern struct btf_id_set name;
        BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, tcp_timewait_sock)          \
        BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, tcp6_sock)                    \
        BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock)                      \
-       BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock)
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock)                    \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock)
 
 enum {
 #define BTF_SOCK_TYPE(name, str) name,
@@ -184,4 +189,6 @@ MAX_BTF_SOCK_TYPE,
 extern u32 btf_sock_ids[];
 #endif
 
+extern u32 btf_task_struct_ids[];
+
 #endif
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1797e85..7d24894 100644
@@ -600,25 +600,38 @@ struct sk_filter {
 
 DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 
-#define __BPF_PROG_RUN(prog, ctx, dfunc)       ({                      \
-       u32 __ret;                                                      \
-       cant_migrate();                                                 \
-       if (static_branch_unlikely(&bpf_stats_enabled_key)) {           \
-               struct bpf_prog_stats *__stats;                         \
-               u64 __start = sched_clock();                            \
-               __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func);   \
-               __stats = this_cpu_ptr(prog->stats);                    \
-               u64_stats_update_begin(&__stats->syncp);                \
-               __stats->cnt++;                                         \
-               __stats->nsecs += sched_clock() - __start;              \
-               u64_stats_update_end(&__stats->syncp);                  \
-       } else {                                                        \
-               __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func);   \
-       }                                                               \
-       __ret; })
-
-#define BPF_PROG_RUN(prog, ctx)                                                \
-       __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func)
+typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
+                                         const struct bpf_insn *insnsi,
+                                         unsigned int (*bpf_func)(const void *,
+                                                                  const struct bpf_insn *));
+
+static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
+                                         const void *ctx,
+                                         bpf_dispatcher_fn dfunc)
+{
+       u32 ret;
+
+       cant_migrate();
+       if (static_branch_unlikely(&bpf_stats_enabled_key)) {
+               struct bpf_prog_stats *stats;
+               u64 start = sched_clock();
+
+               ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
+               stats = this_cpu_ptr(prog->stats);
+               u64_stats_update_begin(&stats->syncp);
+               stats->cnt++;
+               stats->nsecs += sched_clock() - start;
+               u64_stats_update_end(&stats->syncp);
+       } else {
+               ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
+       }
+       return ret;
+}
+
+static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx)
+{
+       return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func);
+}
 
 /*
  * Use in preemptible and therefore migratable context to make sure that
@@ -637,7 +650,7 @@ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
        u32 ret;
 
        migrate_disable();
-       ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func);
+       ret = bpf_prog_run(prog, ctx);
        migrate_enable();
        return ret;
 }
@@ -710,7 +723,7 @@ static inline void bpf_restore_data_end(
        cb->data_end = saved_data_end;
 }
 
-static inline u8 *bpf_skb_cb(struct sk_buff *skb)
+static inline u8 *bpf_skb_cb(const struct sk_buff *skb)
 {
        /* eBPF programs may read/write skb->cb[] area to transfer meta
         * data between tail calls. Since this also needs to work with
@@ -731,8 +744,9 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb)
 
 /* Must be invoked with migration disabled */
 static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
-                                        struct sk_buff *skb)
+                                        const void *ctx)
 {
+       const struct sk_buff *skb = ctx;
        u8 *cb_data = bpf_skb_cb(skb);
        u8 cb_saved[BPF_SKB_CB_LEN];
        u32 res;
@@ -742,7 +756,7 @@ static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
                memset(cb_data, 0, sizeof(cb_saved));
        }
 
-       res = BPF_PROG_RUN(prog, skb);
+       res = bpf_prog_run(prog, skb);
 
        if (unlikely(prog->cb_access))
                memcpy(cb_data, cb_saved, sizeof(cb_saved));
@@ -787,7 +801,7 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
         * under local_bh_disable(), which provides the needed RCU protection
         * for accessing map entries.
         */
-       u32 act = __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp));
+       u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp));
 
        if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) {
                if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev))
@@ -1440,7 +1454,7 @@ static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol,
                };
                u32 act;
 
-               act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, BPF_PROG_RUN);
+               act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run);
                if (act == SK_PASS) {
                        selected_sk = ctx.selected_sk;
                        no_reuseport = ctx.no_reuseport;
@@ -1478,7 +1492,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
                };
                u32 act;
 
-               act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, BPF_PROG_RUN);
+               act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run);
                if (act == SK_PASS) {
                        selected_sk = ctx.selected_sk;
                        no_reuseport = ctx.no_reuseport;
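
With the BPF_PROG_RUN() macro replaced by the bpf_prog_run() and
__bpf_prog_run() inlines above, the stats branch keeps feeding the
per-program counters that the kernel already exposes as run_cnt and
run_time_ns in struct bpf_prog_info. A minimal userspace sketch for
reading them back, assuming a valid prog fd and kernel.bpf_stats_enabled
set to 1; error handling is trimmed:

#include <stdio.h>
#include <bpf/bpf.h>

/* prog_fd: any loaded BPF program fd, obtained elsewhere */
static void print_prog_stats(int prog_fd)
{
	struct bpf_prog_info info = {};
	__u32 len = sizeof(info);

	if (bpf_obj_get_info_by_fd(prog_fd, &info, &len))
		return;

	/* fed by stats->cnt and stats->nsecs in __bpf_prog_run() */
	printf("run_cnt=%llu run_time_ns=%llu\n",
	       (unsigned long long)info.run_cnt,
	       (unsigned long long)info.run_time_ns);
}
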
index 2d510ad..fe156a8 100644 (file)
@@ -762,6 +762,7 @@ struct perf_event {
 #ifdef CONFIG_BPF_SYSCALL
        perf_overflow_handler_t         orig_overflow_handler;
        struct bpf_prog                 *prog;
+       u64                             bpf_cookie;
 #endif
 
 #ifdef CONFIG_EVENT_TRACING
index ad413b3..8e0631a 100644 (file)
@@ -675,7 +675,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
 
 #ifdef CONFIG_BPF_EVENTS
 unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
-int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog);
+int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
 void perf_event_detach_bpf_prog(struct perf_event *event);
 int perf_event_query_prog_array(struct perf_event *event, void __user *info);
 int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
@@ -692,7 +692,7 @@ static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *c
 }
 
 static inline int
-perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog)
+perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie)
 {
        return -EOPNOTSUPP;
 }
@@ -803,6 +803,9 @@ extern void ftrace_profile_free_filter(struct perf_event *event);
 void perf_trace_buf_update(void *record, u16 type);
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
 
+int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
+void perf_event_free_bpf_prog(struct perf_event *event);
+
 void bpf_trace_run1(struct bpf_prog *prog, u64 arg1);
 void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2);
 void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2,
index 4757d7f..7d142e8 100644 (file)
@@ -87,6 +87,8 @@ long unix_outq_len(struct sock *sk);
 
 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
                         int flags);
+int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
+                         int flags);
 #ifdef CONFIG_SYSCTL
 int unix_sysctl_register(struct net *net);
 void unix_sysctl_unregister(struct net *net);
@@ -96,9 +98,11 @@ static inline void unix_sysctl_unregister(struct net *net) {}
 #endif
 
 #ifdef CONFIG_BPF_SYSCALL
-extern struct proto unix_proto;
+extern struct proto unix_dgram_proto;
+extern struct proto unix_stream_proto;
 
-int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
+int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
+int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
 void __init unix_bpf_build_proto(void);
 #else
 static inline void __init unix_bpf_build_proto(void)
index 2db6925..791f31d 100644 (file)
@@ -84,7 +84,7 @@ struct bpf_lpm_trie_key {
 
 struct bpf_cgroup_storage_key {
        __u64   cgroup_inode_id;        /* cgroup inode id */
-       __u32   attach_type;            /* program attach type */
+       __u32   attach_type;            /* program attach type (enum bpf_attach_type) */
 };
 
 union bpf_iter_link_info {
@@ -993,6 +993,7 @@ enum bpf_attach_type {
        BPF_SK_SKB_VERDICT,
        BPF_SK_REUSEPORT_SELECT,
        BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
+       BPF_PERF_EVENT,
        __MAX_BPF_ATTACH_TYPE
 };
 
@@ -1006,6 +1007,7 @@ enum bpf_link_type {
        BPF_LINK_TYPE_ITER = 4,
        BPF_LINK_TYPE_NETNS = 5,
        BPF_LINK_TYPE_XDP = 6,
+       BPF_LINK_TYPE_PERF_EVENT = 7,
 
        MAX_BPF_LINK_TYPE,
 };
@@ -1446,6 +1448,13 @@ union bpf_attr {
                                __aligned_u64   iter_info;      /* extra bpf_iter_link_info */
                                __u32           iter_info_len;  /* iter_info length */
                        };
+                       struct {
+                               /* opaque user-provided value passed through
+                                * to the BPF program at execution time and
+                                * accessible through the
+                                * bpf_get_attach_cookie() BPF helper
+                                */
+                               __u64           bpf_cookie;
+                       } perf_event;
                };
        } link_create;
 
@@ -4847,6 +4856,27 @@ union bpf_attr {
  *             Get address of the traced function (for tracing and kprobe programs).
  *     Return
  *             Address of the traced function.
+ *
+ * u64 bpf_get_attach_cookie(void *ctx)
+ *     Description
+ *             Get the bpf_cookie value provided (optionally) during program
+ *             attachment. It might be different for each individual
+ *             attachment, even if the BPF program itself is the same.
+ *             Expects the BPF program context *ctx* as its first argument.
+ *
+ *             Supported for the following program types:
+ *                     - kprobe/uprobe;
+ *                     - tracepoint;
+ *                     - perf_event.
+ *     Return
+ *             Value specified by the user at BPF link creation/attachment
+ *             time, or 0 if it was not specified.
+ *
+ * long bpf_task_pt_regs(struct task_struct *task)
+ *     Description
+ *             Get the struct pt_regs associated with **task**.
+ *     Return
+ *             A pointer to struct pt_regs.
  */
 #define __BPF_FUNC_MAPPER(FN)          \
        FN(unspec),                     \
@@ -5023,6 +5053,8 @@ union bpf_attr {
        FN(timer_start),                \
        FN(timer_cancel),               \
        FN(get_func_ip),                \
+       FN(get_attach_cookie),          \
+       FN(task_pt_regs),               \
        /* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
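
A minimal BPF-side sketch of the bpf_get_attach_cookie() helper
documented above; the kprobe target is illustrative and a libbpf with
this series' helper definitions is assumed:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("kprobe/do_sys_openat2")
int trace_open(void *ctx)
{
	/* returns 0 if no cookie was supplied at attach time */
	__u64 cookie = bpf_get_attach_cookie(ctx);

	bpf_printk("attachment cookie %llu", cookie);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
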
index 2e9d47b..b2ee450 100644 (file)
@@ -686,7 +686,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
 
        rcu_read_lock();
        migrate_disable();
-       ret = BPF_PROG_RUN(prog, ctx);
+       ret = bpf_prog_run(prog, ctx);
        migrate_enable();
        rcu_read_unlock();
 
index 70f6fd4..d6731c3 100644 (file)
@@ -28,6 +28,7 @@ struct bpf_struct_ops_value {
 
 struct bpf_struct_ops_map {
        struct bpf_map map;
+       struct rcu_head rcu;
        const struct bpf_struct_ops *st_ops;
        /* protect map_update */
        struct mutex lock;
@@ -622,6 +623,14 @@ bool bpf_struct_ops_get(const void *kdata)
        return refcount_inc_not_zero(&kvalue->refcnt);
 }
 
+static void bpf_struct_ops_put_rcu(struct rcu_head *head)
+{
+       struct bpf_struct_ops_map *st_map;
+
+       st_map = container_of(head, struct bpf_struct_ops_map, rcu);
+       bpf_map_put(&st_map->map);
+}
+
 void bpf_struct_ops_put(const void *kdata)
 {
        struct bpf_struct_ops_value *kvalue;
@@ -632,6 +641,17 @@ void bpf_struct_ops_put(const void *kdata)
 
                st_map = container_of(kvalue, struct bpf_struct_ops_map,
                                      kvalue);
-               bpf_map_put(&st_map->map);
+               /* A struct_ops function may switch to another struct_ops.
+                *
+                * For example, bpf_tcp_cc_x->init() may switch to
+                * another tcp_cc_y by calling
+                * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+                * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+                * and its map->refcnt may reach 0, which would then free its
+                * trampoline image while tcp_cc_x is still running.
+                *
+                * Thus, an rcu grace period is needed here.
+                */
+               call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
        }
 }
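
The comment above describes a bpf-tcp-cc implementation switching
congestion control from its own init. A hedged fragment of what such a
struct_ops program could look like, relying on the bpf-tcp-cc
setsockopt support added in this batch; the section name and "cubic"
are illustrative:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define SOL_TCP		6	/* assumed; matches uapi value */
#define TCP_CONGESTION	13	/* assumed; matches uapi value */

SEC("struct_ops/bpf_cc_init")
void BPF_PROG(bpf_cc_init, struct sock *sk)
{
	char cc[] = "cubic";	/* fallback CC, illustrative */

	/* This can drop the last ref on the current struct_ops map,
	 * taking the call_rcu() path added above.
	 */
	bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, cc, sizeof(cc));
}
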
index 3ce7575..ebfa8bc 100644 (file)
@@ -317,15 +317,13 @@ const struct bpf_map_ops task_storage_map_ops = {
        .map_owner_storage_ptr = task_storage_ptr,
 };
 
-BTF_ID_LIST_SINGLE(bpf_task_storage_btf_ids, struct, task_struct)
-
 const struct bpf_func_proto bpf_task_storage_get_proto = {
        .func = bpf_task_storage_get,
        .gpl_only = false,
        .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
        .arg1_type = ARG_CONST_MAP_PTR,
        .arg2_type = ARG_PTR_TO_BTF_ID,
-       .arg2_btf_id = &bpf_task_storage_btf_ids[0],
+       .arg2_btf_id = &btf_task_struct_ids[0],
        .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
        .arg4_type = ARG_ANYTHING,
 };
@@ -336,5 +334,5 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = {
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_CONST_MAP_PTR,
        .arg2_type = ARG_PTR_TO_BTF_ID,
-       .arg2_btf_id = &bpf_task_storage_btf_ids[0],
+       .arg2_btf_id = &btf_task_struct_ids[0],
 };
index c395024..dfe61df 100644 (file)
@@ -6213,3 +6213,5 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
        .arg3_type      = ARG_ANYTHING,
        .arg4_type      = ARG_ANYTHING,
 };
+
+BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct)
index b567ca4..03145d4 100644 (file)
@@ -19,7 +19,7 @@
 
 #include "../cgroup/cgroup-internal.h"
 
-DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE);
+DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
 
 void cgroup_bpf_offline(struct cgroup *cgrp)
@@ -113,12 +113,12 @@ static void cgroup_bpf_release(struct work_struct *work)
        struct list_head *storages = &cgrp->bpf.storages;
        struct bpf_cgroup_storage *storage, *stmp;
 
-       unsigned int type;
+       unsigned int atype;
 
        mutex_lock(&cgroup_mutex);
 
-       for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
-               struct list_head *progs = &cgrp->bpf.progs[type];
+       for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
+               struct list_head *progs = &cgrp->bpf.progs[atype];
                struct bpf_prog_list *pl, *pltmp;
 
                list_for_each_entry_safe(pl, pltmp, progs, node) {
@@ -128,10 +128,10 @@ static void cgroup_bpf_release(struct work_struct *work)
                        if (pl->link)
                                bpf_cgroup_link_auto_detach(pl->link);
                        kfree(pl);
-                       static_branch_dec(&cgroup_bpf_enabled_key[type]);
+                       static_branch_dec(&cgroup_bpf_enabled_key[atype]);
                }
                old_array = rcu_dereference_protected(
-                               cgrp->bpf.effective[type],
+                               cgrp->bpf.effective[atype],
                                lockdep_is_held(&cgroup_mutex));
                bpf_prog_array_free(old_array);
        }
@@ -196,7 +196,7 @@ static u32 prog_list_length(struct list_head *head)
  * if parent has overridable or multi-prog, allow attaching
  */
 static bool hierarchy_allows_attach(struct cgroup *cgrp,
-                                   enum bpf_attach_type type)
+                                   enum cgroup_bpf_attach_type atype)
 {
        struct cgroup *p;
 
@@ -204,12 +204,12 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
        if (!p)
                return true;
        do {
-               u32 flags = p->bpf.flags[type];
+               u32 flags = p->bpf.flags[atype];
                u32 cnt;
 
                if (flags & BPF_F_ALLOW_MULTI)
                        return true;
-               cnt = prog_list_length(&p->bpf.progs[type]);
+               cnt = prog_list_length(&p->bpf.progs[atype]);
                WARN_ON_ONCE(cnt > 1);
                if (cnt == 1)
                        return !!(flags & BPF_F_ALLOW_OVERRIDE);
@@ -225,7 +225,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
  * to programs in this cgroup
  */
 static int compute_effective_progs(struct cgroup *cgrp,
-                                  enum bpf_attach_type type,
+                                  enum cgroup_bpf_attach_type atype,
                                   struct bpf_prog_array **array)
 {
        struct bpf_prog_array_item *item;
@@ -236,8 +236,8 @@ static int compute_effective_progs(struct cgroup *cgrp,
 
        /* count number of effective programs by walking parents */
        do {
-               if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
-                       cnt += prog_list_length(&p->bpf.progs[type]);
+               if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
+                       cnt += prog_list_length(&p->bpf.progs[atype]);
                p = cgroup_parent(p);
        } while (p);
 
@@ -249,10 +249,10 @@ static int compute_effective_progs(struct cgroup *cgrp,
        cnt = 0;
        p = cgrp;
        do {
-               if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+               if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
                        continue;
 
-               list_for_each_entry(pl, &p->bpf.progs[type], node) {
+               list_for_each_entry(pl, &p->bpf.progs[atype], node) {
                        if (!prog_list_prog(pl))
                                continue;
 
@@ -269,10 +269,10 @@ static int compute_effective_progs(struct cgroup *cgrp,
 }
 
 static void activate_effective_progs(struct cgroup *cgrp,
-                                    enum bpf_attach_type type,
+                                    enum cgroup_bpf_attach_type atype,
                                     struct bpf_prog_array *old_array)
 {
-       old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array,
+       old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
                                        lockdep_is_held(&cgroup_mutex));
        /* free prog array after grace period, since __cgroup_bpf_run_*()
         * might still be walking the array
@@ -328,7 +328,7 @@ cleanup:
 }
 
 static int update_effective_progs(struct cgroup *cgrp,
-                                 enum bpf_attach_type type)
+                                 enum cgroup_bpf_attach_type atype)
 {
        struct cgroup_subsys_state *css;
        int err;
@@ -340,7 +340,7 @@ static int update_effective_progs(struct cgroup *cgrp,
                if (percpu_ref_is_zero(&desc->bpf.refcnt))
                        continue;
 
-               err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+               err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
                if (err)
                        goto cleanup;
        }
@@ -357,7 +357,7 @@ static int update_effective_progs(struct cgroup *cgrp,
                        continue;
                }
 
-               activate_effective_progs(desc, type, desc->bpf.inactive);
+               activate_effective_progs(desc, atype, desc->bpf.inactive);
                desc->bpf.inactive = NULL;
        }
 
@@ -436,11 +436,12 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
                        enum bpf_attach_type type, u32 flags)
 {
        u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
-       struct list_head *progs = &cgrp->bpf.progs[type];
        struct bpf_prog *old_prog = NULL;
        struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
        struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+       enum cgroup_bpf_attach_type atype;
        struct bpf_prog_list *pl;
+       struct list_head *progs;
        int err;
 
        if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
@@ -454,10 +455,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
                /* replace_prog implies BPF_F_REPLACE, and vice versa */
                return -EINVAL;
 
-       if (!hierarchy_allows_attach(cgrp, type))
+       atype = to_cgroup_bpf_attach_type(type);
+       if (atype < 0)
+               return -EINVAL;
+
+       progs = &cgrp->bpf.progs[atype];
+
+       if (!hierarchy_allows_attach(cgrp, atype))
                return -EPERM;
 
-       if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags)
+       if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
                /* Disallow attaching non-overridable on top
                 * of existing overridable in this cgroup.
                 * Disallow attaching multi-prog if overridable or none
@@ -490,16 +497,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
        pl->prog = prog;
        pl->link = link;
        bpf_cgroup_storages_assign(pl->storage, storage);
-       cgrp->bpf.flags[type] = saved_flags;
+       cgrp->bpf.flags[atype] = saved_flags;
 
-       err = update_effective_progs(cgrp, type);
+       err = update_effective_progs(cgrp, atype);
        if (err)
                goto cleanup;
 
        if (old_prog)
                bpf_prog_put(old_prog);
        else
-               static_branch_inc(&cgroup_bpf_enabled_key[type]);
+               static_branch_inc(&cgroup_bpf_enabled_key[atype]);
        bpf_cgroup_storages_link(new_storage, cgrp, type);
        return 0;
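
The type-to-atype conversions in this file rely on
to_cgroup_bpf_attach_type() to map the sparse uapi enum onto a dense
cgroup-local enum, which is what shrinks struct cgroup_bpf. A
simplified sketch of the mapping's shape; the actual helper is defined
in bpf-cgroup.h by this series and covers every cgroup attach type:

/* Simplified sketch; CGROUP_BPF_ATTACH_TYPE_INVALID is assumed to be
 * negative so that callers can test for atype < 0.
 */
static inline enum cgroup_bpf_attach_type
to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type)
{
	switch (attach_type) {
	case BPF_CGROUP_INET_INGRESS:	return CGROUP_INET_INGRESS;
	case BPF_CGROUP_INET_EGRESS:	return CGROUP_INET_EGRESS;
	case BPF_CGROUP_SETSOCKOPT:	return CGROUP_SETSOCKOPT;
	case BPF_CGROUP_GETSOCKOPT:	return CGROUP_GETSOCKOPT;
	/* ... one case per remaining cgroup attach type ... */
	default:			return CGROUP_BPF_ATTACH_TYPE_INVALID;
	}
}
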
 
@@ -520,7 +527,7 @@ cleanup:
  * all descendant cgroups. This function is guaranteed to succeed.
  */
 static void replace_effective_prog(struct cgroup *cgrp,
-                                  enum bpf_attach_type type,
+                                  enum cgroup_bpf_attach_type atype,
                                   struct bpf_cgroup_link *link)
 {
        struct bpf_prog_array_item *item;
@@ -539,10 +546,10 @@ static void replace_effective_prog(struct cgroup *cgrp,
 
                /* find position of link in effective progs array */
                for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
-                       if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+                       if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
                                continue;
 
-                       head = &cg->bpf.progs[type];
+                       head = &cg->bpf.progs[atype];
                        list_for_each_entry(pl, head, node) {
                                if (!prog_list_prog(pl))
                                        continue;
@@ -554,7 +561,7 @@ static void replace_effective_prog(struct cgroup *cgrp,
 found:
                BUG_ON(!cg);
                progs = rcu_dereference_protected(
-                               desc->bpf.effective[type],
+                               desc->bpf.effective[atype],
                                lockdep_is_held(&cgroup_mutex));
                item = &progs->items[pos];
                WRITE_ONCE(item->prog, link->link.prog);
@@ -574,11 +581,18 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
                                struct bpf_cgroup_link *link,
                                struct bpf_prog *new_prog)
 {
-       struct list_head *progs = &cgrp->bpf.progs[link->type];
+       enum cgroup_bpf_attach_type atype;
        struct bpf_prog *old_prog;
        struct bpf_prog_list *pl;
+       struct list_head *progs;
        bool found = false;
 
+       atype = to_cgroup_bpf_attach_type(link->type);
+       if (atype < 0)
+               return -EINVAL;
+
+       progs = &cgrp->bpf.progs[atype];
+
        if (link->link.prog->type != new_prog->type)
                return -EINVAL;
 
@@ -592,7 +606,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
                return -ENOENT;
 
        old_prog = xchg(&link->link.prog, new_prog);
-       replace_effective_prog(cgrp, link->type, link);
+       replace_effective_prog(cgrp, atype, link);
        bpf_prog_put(old_prog);
        return 0;
 }
@@ -667,12 +681,20 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
                        struct bpf_cgroup_link *link, enum bpf_attach_type type)
 {
-       struct list_head *progs = &cgrp->bpf.progs[type];
-       u32 flags = cgrp->bpf.flags[type];
-       struct bpf_prog_list *pl;
+       enum cgroup_bpf_attach_type atype;
        struct bpf_prog *old_prog;
+       struct bpf_prog_list *pl;
+       struct list_head *progs;
+       u32 flags;
        int err;
 
+       atype = to_cgroup_bpf_attach_type(type);
+       if (atype < 0)
+               return -EINVAL;
+
+       progs = &cgrp->bpf.progs[atype];
+       flags = cgrp->bpf.flags[atype];
+
        if (prog && link)
                /* only one of prog or link can be specified */
                return -EINVAL;
@@ -686,7 +708,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
        pl->prog = NULL;
        pl->link = NULL;
 
-       err = update_effective_progs(cgrp, type);
+       err = update_effective_progs(cgrp, atype);
        if (err)
                goto cleanup;
 
@@ -695,10 +717,10 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
        kfree(pl);
        if (list_empty(progs))
                /* last program was detached, reset flags to zero */
-               cgrp->bpf.flags[type] = 0;
+               cgrp->bpf.flags[atype] = 0;
        if (old_prog)
                bpf_prog_put(old_prog);
-       static_branch_dec(&cgroup_bpf_enabled_key[type]);
+       static_branch_dec(&cgroup_bpf_enabled_key[atype]);
        return 0;
 
 cleanup:
@@ -714,13 +736,21 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 {
        __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
        enum bpf_attach_type type = attr->query.attach_type;
-       struct list_head *progs = &cgrp->bpf.progs[type];
-       u32 flags = cgrp->bpf.flags[type];
+       enum cgroup_bpf_attach_type atype;
        struct bpf_prog_array *effective;
+       struct list_head *progs;
        struct bpf_prog *prog;
        int cnt, ret = 0, i;
+       u32 flags;
 
-       effective = rcu_dereference_protected(cgrp->bpf.effective[type],
+       atype = to_cgroup_bpf_attach_type(type);
+       if (atype < 0)
+               return -EINVAL;
+
+       progs = &cgrp->bpf.progs[atype];
+       flags = cgrp->bpf.flags[atype];
+
+       effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
                                              lockdep_is_held(&cgroup_mutex));
 
        if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
@@ -925,14 +955,14 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
        link->cgroup = cgrp;
        link->type = attr->link_create.attach_type;
 
-       err  = bpf_link_prime(&link->link, &link_primer);
+       err = bpf_link_prime(&link->link, &link_primer);
        if (err) {
                kfree(link);
                goto out_put_cgroup;
        }
 
-       err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type,
-                               BPF_F_ALLOW_MULTI);
+       err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
+                               link->type, BPF_F_ALLOW_MULTI);
        if (err) {
                bpf_link_cleanup(&link_primer);
                goto out_put_cgroup;
@@ -986,7 +1016,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
  */
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
                                struct sk_buff *skb,
-                               enum bpf_attach_type type)
+                               enum cgroup_bpf_attach_type atype)
 {
        unsigned int offset = skb->data - skb_network_header(skb);
        struct sock *save_sk;
@@ -1008,12 +1038,12 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
        /* compute pointers for the bpf prog */
        bpf_compute_and_save_data_end(skb, &saved_data_end);
 
-       if (type == BPF_CGROUP_INET_EGRESS) {
+       if (atype == CGROUP_INET_EGRESS) {
                ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
-                       cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
+                       cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb);
        } else {
-               ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
-                                         __bpf_prog_run_save_cb);
+               ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb,
+                                           __bpf_prog_run_save_cb);
                ret = (ret == 1 ? 0 : -EPERM);
        }
        bpf_restore_data_end(skb, saved_data_end);
@@ -1038,12 +1068,12 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
  * and if it returned != 1 during execution. In all other cases, 0 is returned.
  */
 int __cgroup_bpf_run_filter_sk(struct sock *sk,
-                              enum bpf_attach_type type)
+                              enum cgroup_bpf_attach_type atype)
 {
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        int ret;
 
-       ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
+       ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run);
        return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -1065,7 +1095,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  */
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
                                      struct sockaddr *uaddr,
-                                     enum bpf_attach_type type,
+                                     enum cgroup_bpf_attach_type atype,
                                      void *t_ctx,
                                      u32 *flags)
 {
@@ -1090,8 +1120,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
        }
 
        cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-       ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx,
-                                      BPF_PROG_RUN, flags);
+       ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
+                                         bpf_prog_run, flags);
 
        return ret == 1 ? 0 : -EPERM;
 }
@@ -1115,19 +1145,19 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
  */
 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
                                     struct bpf_sock_ops_kern *sock_ops,
-                                    enum bpf_attach_type type)
+                                    enum cgroup_bpf_attach_type atype)
 {
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        int ret;
 
-       ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
-                                BPF_PROG_RUN);
+       ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
+                                   bpf_prog_run);
        return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
 
 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
-                                     short access, enum bpf_attach_type type)
+                                     short access, enum cgroup_bpf_attach_type atype)
 {
        struct cgroup *cgrp;
        struct bpf_cgroup_dev_ctx ctx = {
@@ -1135,12 +1165,12 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
                .major = major,
                .minor = minor,
        };
-       int allow = 1;
+       int allow;
 
        rcu_read_lock();
        cgrp = task_dfl_cgroup(current);
-       allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
-                                  BPF_PROG_RUN);
+       allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
+                                     bpf_prog_run);
        rcu_read_unlock();
 
        return !allow;
@@ -1231,7 +1261,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
                                   struct ctl_table *table, int write,
                                   char **buf, size_t *pcount, loff_t *ppos,
-                                  enum bpf_attach_type type)
+                                  enum cgroup_bpf_attach_type atype)
 {
        struct bpf_sysctl_kern ctx = {
                .head = head,
@@ -1271,7 +1301,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 
        rcu_read_lock();
        cgrp = task_dfl_cgroup(current);
-       ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+       ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run);
        rcu_read_unlock();
 
        kfree(ctx.cur_val);
@@ -1289,7 +1319,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 
 #ifdef CONFIG_NET
 static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
-                                            enum bpf_attach_type attach_type)
+                                            enum cgroup_bpf_attach_type attach_type)
 {
        struct bpf_prog_array *prog_array;
        bool empty;
@@ -1364,7 +1394,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
         * attached to the hook so we don't waste time allocating
         * memory and locking the socket.
         */
-       if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+       if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT))
                return 0;
 
        /* Allocate a bit more than the initial user buffer for
@@ -1385,8 +1415,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
        }
 
        lock_sock(sk);
-       ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
-                                &ctx, BPF_PROG_RUN);
+       ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT],
+                                   &ctx, bpf_prog_run);
        release_sock(sk);
 
        if (!ret) {
@@ -1460,7 +1490,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
         * attached to the hook so we don't waste time allocating
         * memory and locking the socket.
         */
-       if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+       if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT))
                return retval;
 
        ctx.optlen = max_optlen;
@@ -1495,8 +1525,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
        }
 
        lock_sock(sk);
-       ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
-                                &ctx, BPF_PROG_RUN);
+       ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
+                                   &ctx, bpf_prog_run);
        release_sock(sk);
 
        if (!ret) {
@@ -1556,8 +1586,8 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
         * be called if that data shouldn't be "exported".
         */
 
-       ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
-                                &ctx, BPF_PROG_RUN);
+       ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
+                                   &ctx, bpf_prog_run);
        if (!ret)
                return -EPERM;
 
@@ -1846,15 +1876,41 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
 const struct bpf_prog_ops cg_sysctl_prog_ops = {
 };
 
+#ifdef CONFIG_NET
+BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
+{
+       const struct net *net = ctx ? sock_net(ctx->sk) : &init_net;
+
+       return net->net_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
+       .func           = bpf_get_netns_cookie_sockopt,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX_OR_NULL,
+};
+#endif
+
 static const struct bpf_func_proto *
 cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
        switch (func_id) {
 #ifdef CONFIG_NET
+       case BPF_FUNC_get_netns_cookie:
+               return &bpf_get_netns_cookie_sockopt_proto;
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
+       case BPF_FUNC_setsockopt:
+               if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
+                       return &bpf_sk_setsockopt_proto;
+               return NULL;
+       case BPF_FUNC_getsockopt:
+               if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
+                       return &bpf_sk_getsockopt_proto;
+               return NULL;
 #endif
 #ifdef CONFIG_INET
        case BPF_FUNC_tcp_sock:
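
With the proto hooks above in place, a sockopt program can read the
netns cookie. A minimal sketch; the policy itself is illustrative:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/setsockopt")
int sockopt_log_netns(struct bpf_sockopt *ctx)
{
	/* per-netns cookie, stable for the netns lifetime */
	__u64 cookie = bpf_get_netns_cookie(ctx);

	bpf_printk("setsockopt in netns cookie %llu", cookie);
	return 1; /* 1 lets the setsockopt() proceed unchanged */
}

char LICENSE[] SEC("license") = "GPL";
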
index 82af627..9f4636d 100644 (file)
@@ -1564,7 +1564,7 @@ select_insn:
 
                if (unlikely(index >= array->map.max_entries))
                        goto out;
-               if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
+               if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
                        goto out;
 
                tail_call_cnt++;
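
The comparison flip above lets tail_call_cnt reach MAX_TAIL_CALL_CNT
before bailing out, i.e. one more chained tail call than the old check
permitted. A standalone userspace illustration of the boundary,
assuming MAX_TAIL_CALL_CNT is 32:

#include <stdio.h>

#define MAX_TAIL_CALL_CNT 32

static int tail_calls_allowed(int old_check)
{
	int tail_call_cnt = 0, calls = 0;

	for (;;) {
		if (old_check ? tail_call_cnt >= MAX_TAIL_CALL_CNT
			      : tail_call_cnt >  MAX_TAIL_CALL_CNT)
			break;
		tail_call_cnt++;
		calls++;
	}
	return calls;
}

int main(void)
{
	printf("old check (>=): %d tail calls\n", tail_calls_allowed(1));
	printf("new check (>):  %d tail calls\n", tail_calls_allowed(0));
	return 0;
}
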
@@ -1879,7 +1879,7 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
  *     @err: pointer to error variable
  *
  * Try to JIT eBPF program, if JIT is not available, use interpreter.
- * The BPF program will be executed via BPF_PROG_RUN() macro.
+ * The BPF program will be executed via the bpf_prog_run() function.
  *
  * Return: the &fp argument along with &err set to 0 for success or
  * a negative errno code on failure
@@ -2119,13 +2119,13 @@ int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
 int bpf_prog_array_copy(struct bpf_prog_array *old_array,
                        struct bpf_prog *exclude_prog,
                        struct bpf_prog *include_prog,
+                       u64 bpf_cookie,
                        struct bpf_prog_array **new_array)
 {
        int new_prog_cnt, carry_prog_cnt = 0;
-       struct bpf_prog_array_item *existing;
+       struct bpf_prog_array_item *existing, *new;
        struct bpf_prog_array *array;
        bool found_exclude = false;
-       int new_prog_idx = 0;
 
        /* Figure out how many existing progs we need to carry over to
         * the new array.
@@ -2162,20 +2162,27 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
        array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
        if (!array)
                return -ENOMEM;
+       new = array->items;
 
        /* Fill in the new prog array */
        if (carry_prog_cnt) {
                existing = old_array->items;
-               for (; existing->prog; existing++)
-                       if (existing->prog != exclude_prog &&
-                           existing->prog != &dummy_bpf_prog.prog) {
-                               array->items[new_prog_idx++].prog =
-                                       existing->prog;
-                       }
+               for (; existing->prog; existing++) {
+                       if (existing->prog == exclude_prog ||
+                           existing->prog == &dummy_bpf_prog.prog)
+                               continue;
+
+                       new->prog = existing->prog;
+                       new->bpf_cookie = existing->bpf_cookie;
+                       new++;
+               }
        }
-       if (include_prog)
-               array->items[new_prog_idx++].prog = include_prog;
-       array->items[new_prog_idx].prog = NULL;
+       if (include_prog) {
+               new->prog = include_prog;
+               new->bpf_cookie = bpf_cookie;
+               new++;
+       }
+       new->prog = NULL;
        *new_array = array;
        return 0;
 }
index 04010a5..9aabf84 100644 (file)
@@ -918,6 +918,20 @@ fmt_str:
                        num_spec++;
 
                        continue;
+               } else if (fmt[i] == 'c') {
+                       if (!tmp_buf)
+                               goto nocopy_fmt;
+
+                       if (tmp_buf_end == tmp_buf) {
+                               err = -ENOSPC;
+                               goto out;
+                       }
+
+                       *tmp_buf = raw_args[num_spec];
+                       tmp_buf++;
+                       num_spec++;
+
+                       continue;
                }
 
                sizeof_cur_arg = sizeof(int);
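
The new branch above copies one u64 argument per '%c' conversion into
the temporary buffer. A hedged BPF-side sketch exercising it through
bpf_snprintf(); the tracepoint is illustrative:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("tracepoint/syscalls/sys_enter_getpid")
int fmt_one_char(void *ctx)
{
	static const char fmt[] = "grade=%c";
	__u64 args[] = { 'A' };
	char out[16];

	/* one u64 per conversion; %c consumes raw_args[num_spec] */
	bpf_snprintf(out, sizeof(out), fmt, args, sizeof(args));
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
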
@@ -1318,10 +1332,12 @@ out:
 }
 
 const struct bpf_func_proto bpf_get_current_task_proto __weak;
+const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
 const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
 const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
+const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
 
 const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
@@ -1403,6 +1419,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
                return bpf_get_trace_printk_proto();
        case BPF_FUNC_get_current_task:
                return &bpf_get_current_task_proto;
+       case BPF_FUNC_get_current_task_btf:
+               return &bpf_get_current_task_btf_proto;
        case BPF_FUNC_probe_read_user:
                return &bpf_probe_read_user_proto;
        case BPF_FUNC_probe_read_kernel:
@@ -1417,6 +1435,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
                return &bpf_snprintf_btf_proto;
        case BPF_FUNC_snprintf:
                return &bpf_snprintf_proto;
+       case BPF_FUNC_task_pt_regs:
+               return &bpf_task_pt_regs_proto;
        default:
                return NULL;
        }
index 6fbc2ab..e8eefdf 100644 (file)
@@ -530,14 +530,12 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
        return res;
 }
 
-BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct)
-
 const struct bpf_func_proto bpf_get_task_stack_proto = {
        .func           = bpf_get_task_stack,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_BTF_ID,
-       .arg1_btf_id    = &bpf_get_task_stack_btf_ids[0],
+       .arg1_btf_id    = &btf_task_struct_ids[0],
        .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type      = ARG_ANYTHING,
index 9a2068e..4e50c0b 100644 (file)
@@ -1013,7 +1013,7 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
 static void *__bpf_copy_key(void __user *ukey, u64 key_size)
 {
        if (key_size)
-               return memdup_user(ukey, key_size);
+               return vmemdup_user(ukey, key_size);
 
        if (ukey)
                return ERR_PTR(-EINVAL);
@@ -1024,7 +1024,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size)
 static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
 {
        if (key_size)
-               return memdup_bpfptr(ukey, key_size);
+               return kvmemdup_bpfptr(ukey, key_size);
 
        if (!bpfptr_is_null(ukey))
                return ERR_PTR(-EINVAL);
@@ -1076,7 +1076,7 @@ static int map_lookup_elem(union bpf_attr *attr)
        value_size = bpf_map_value_size(map);
 
        err = -ENOMEM;
-       value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+       value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
        if (!value)
                goto free_key;
 
@@ -1091,9 +1091,9 @@ static int map_lookup_elem(union bpf_attr *attr)
        err = 0;
 
 free_value:
-       kfree(value);
+       kvfree(value);
 free_key:
-       kfree(key);
+       kvfree(key);
 err_put:
        fdput(f);
        return err;
@@ -1137,16 +1137,10 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
                goto err_put;
        }
 
-       if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-           map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-           map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
-           map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
-               value_size = round_up(map->value_size, 8) * num_possible_cpus();
-       else
-               value_size = map->value_size;
+       value_size = bpf_map_value_size(map);
 
        err = -ENOMEM;
-       value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+       value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
        if (!value)
                goto free_key;
 
@@ -1157,9 +1151,9 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
        err = bpf_map_update_value(map, f, key, value, attr->flags);
 
 free_value:
-       kfree(value);
+       kvfree(value);
 free_key:
-       kfree(key);
+       kvfree(key);
 err_put:
        fdput(f);
        return err;
@@ -1211,7 +1205,7 @@ static int map_delete_elem(union bpf_attr *attr)
        bpf_enable_instrumentation();
        maybe_wait_bpf_programs(map);
 out:
-       kfree(key);
+       kvfree(key);
 err_put:
        fdput(f);
        return err;
@@ -1253,7 +1247,7 @@ static int map_get_next_key(union bpf_attr *attr)
        }
 
        err = -ENOMEM;
-       next_key = kmalloc(map->key_size, GFP_USER);
+       next_key = kvmalloc(map->key_size, GFP_USER);
        if (!next_key)
                goto free_key;
 
@@ -1276,9 +1270,9 @@ out:
        err = 0;
 
 free_next_key:
-       kfree(next_key);
+       kvfree(next_key);
 free_key:
-       kfree(key);
+       kvfree(key);
 err_put:
        fdput(f);
        return err;
@@ -1305,7 +1299,7 @@ int generic_map_delete_batch(struct bpf_map *map,
        if (!max_count)
                return 0;
 
-       key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+       key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
        if (!key)
                return -ENOMEM;
 
@@ -1332,7 +1326,7 @@ int generic_map_delete_batch(struct bpf_map *map,
        if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
                err = -EFAULT;
 
-       kfree(key);
+       kvfree(key);
        return err;
 }
 
@@ -1363,13 +1357,13 @@ int generic_map_update_batch(struct bpf_map *map,
        if (!max_count)
                return 0;
 
-       key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+       key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
        if (!key)
                return -ENOMEM;
 
-       value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+       value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
        if (!value) {
-               kfree(key);
+               kvfree(key);
                return -ENOMEM;
        }
 
@@ -1390,8 +1384,8 @@ int generic_map_update_batch(struct bpf_map *map,
        if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
                err = -EFAULT;
 
-       kfree(value);
-       kfree(key);
+       kvfree(value);
+       kvfree(key);
        return err;
 }
 
@@ -1425,13 +1419,13 @@ int generic_map_lookup_batch(struct bpf_map *map,
        if (put_user(0, &uattr->batch.count))
                return -EFAULT;
 
-       buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+       buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
        if (!buf_prevkey)
                return -ENOMEM;
 
-       buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
+       buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
        if (!buf) {
-               kfree(buf_prevkey);
+               kvfree(buf_prevkey);
                return -ENOMEM;
        }
 
@@ -1491,8 +1485,8 @@ int generic_map_lookup_batch(struct bpf_map *map,
                err = -EFAULT;
 
 free_buf:
-       kfree(buf_prevkey);
-       kfree(buf);
+       kvfree(buf_prevkey);
+       kvfree(buf);
        return err;
 }
 
@@ -1547,7 +1541,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
        value_size = bpf_map_value_size(map);
 
        err = -ENOMEM;
-       value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+       value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
        if (!value)
                goto free_key;
 
@@ -1579,9 +1573,9 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
        err = 0;
 
 free_value:
-       kfree(value);
+       kvfree(value);
 free_key:
-       kfree(key);
+       kvfree(key);
 err_put:
        fdput(f);
        return err;
@@ -2906,6 +2900,79 @@ static const struct bpf_link_ops bpf_raw_tp_link_lops = {
        .fill_link_info = bpf_raw_tp_link_fill_link_info,
 };
 
+#ifdef CONFIG_PERF_EVENTS
+struct bpf_perf_link {
+       struct bpf_link link;
+       struct file *perf_file;
+};
+
+static void bpf_perf_link_release(struct bpf_link *link)
+{
+       struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
+       struct perf_event *event = perf_link->perf_file->private_data;
+
+       perf_event_free_bpf_prog(event);
+       fput(perf_link->perf_file);
+}
+
+static void bpf_perf_link_dealloc(struct bpf_link *link)
+{
+       struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
+
+       kfree(perf_link);
+}
+
+static const struct bpf_link_ops bpf_perf_link_lops = {
+       .release = bpf_perf_link_release,
+       .dealloc = bpf_perf_link_dealloc,
+};
+
+static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+       struct bpf_link_primer link_primer;
+       struct bpf_perf_link *link;
+       struct perf_event *event;
+       struct file *perf_file;
+       int err;
+
+       if (attr->link_create.flags)
+               return -EINVAL;
+
+       perf_file = perf_event_get(attr->link_create.target_fd);
+       if (IS_ERR(perf_file))
+               return PTR_ERR(perf_file);
+
+       link = kzalloc(sizeof(*link), GFP_USER);
+       if (!link) {
+               err = -ENOMEM;
+               goto out_put_file;
+       }
+       bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);
+       link->perf_file = perf_file;
+
+       err = bpf_link_prime(&link->link, &link_primer);
+       if (err) {
+               kfree(link);
+               goto out_put_file;
+       }
+
+       event = perf_file->private_data;
+       err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie);
+       if (err) {
+               bpf_link_cleanup(&link_primer);
+               goto out_put_file;
+       }
+       /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */
+       bpf_prog_inc(prog);
+
+       return bpf_link_settle(&link_primer);
+
+out_put_file:
+       fput(perf_file);
+       return err;
+}
+#endif /* CONFIG_PERF_EVENTS */
+
 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
 
 static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
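
The userspace counterpart of bpf_perf_link_attach() goes through
bpf_link_create() with the new BPF_PERF_EVENT attach type. A sketch
using the bpf_link_create_opts perf_event.bpf_cookie field added on the
libbpf side of this series; prog_fd/perf_fd setup is omitted:

#include <bpf/bpf.h>

/* prog_fd: loaded kprobe/tracepoint/perf_event program,
 * perf_fd: fd returned by perf_event_open(), both set up elsewhere
 */
static int attach_with_cookie(int prog_fd, int perf_fd, __u64 cookie)
{
	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts,
			    .perf_event.bpf_cookie = cookie);

	return bpf_link_create(prog_fd, perf_fd, BPF_PERF_EVENT, &opts);
}
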
@@ -4147,15 +4214,26 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
        if (ret)
                goto out;
 
-       if (prog->type == BPF_PROG_TYPE_EXT) {
+       switch (prog->type) {
+       case BPF_PROG_TYPE_EXT:
                ret = tracing_bpf_link_attach(attr, uattr, prog);
                goto out;
-       }
-
-       ptype = attach_type_to_prog_type(attr->link_create.attach_type);
-       if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
-               ret = -EINVAL;
-               goto out;
+       case BPF_PROG_TYPE_PERF_EVENT:
+       case BPF_PROG_TYPE_KPROBE:
+       case BPF_PROG_TYPE_TRACEPOINT:
+               if (attr->link_create.attach_type != BPF_PERF_EVENT) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               ptype = prog->type;
+               break;
+       default:
+               ptype = attach_type_to_prog_type(attr->link_create.attach_type);
+               if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               break;
        }
 
        switch (ptype) {
@@ -4180,6 +4258,13 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
                ret = bpf_xdp_link_attach(attr, prog);
                break;
 #endif
+#ifdef CONFIG_PERF_EVENTS
+       case BPF_PROG_TYPE_PERF_EVENT:
+       case BPF_PROG_TYPE_TRACEPOINT:
+       case BPF_PROG_TYPE_KPROBE:
+               ret = bpf_perf_link_attach(attr, prog);
+               break;
+#endif
        default:
                ret = -EINVAL;
        }
index b68cb5d..b48750b 100644 (file)
@@ -525,7 +525,6 @@ static const struct seq_operations task_vma_seq_ops = {
 };
 
 BTF_ID_LIST(btf_task_file_ids)
-BTF_ID(struct, task_struct)
 BTF_ID(struct, file)
 BTF_ID(struct, vm_area_struct)
 
@@ -591,19 +590,19 @@ static int __init task_iter_init(void)
 {
        int ret;
 
-       task_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
+       task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
        ret = bpf_iter_reg_target(&task_reg_info);
        if (ret)
                return ret;
 
-       task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
-       task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
+       task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
+       task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0];
        ret =  bpf_iter_reg_target(&task_file_reg_info);
        if (ret)
                return ret;
 
-       task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
-       task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[2];
+       task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
+       task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
        return bpf_iter_reg_target(&task_vma_reg_info);
 }
 late_initcall(task_iter_init);
index b2535ac..fe1e857 100644 (file)
@@ -548,7 +548,7 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
        u64_stats_update_end(&stats->syncp);
 }
 
-/* The logic is similar to BPF_PROG_RUN, but with an explicit
+/* The logic is similar to bpf_prog_run(), but with an explicit
  * rcu_read_lock() and migrate_disable() which are required
  * for the trampoline. The macro is split into
  * call __bpf_prog_enter
index 9134aed..047ac4b 100644 (file)
@@ -12300,6 +12300,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
                if (is_narrower_load && size < target_size) {
                        u8 shift = bpf_ctx_narrow_access_offset(
                                off, size, size_default) * 8;
+                       if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
+                               verbose(env, "bpf verifier narrow ctx load misconfigured\n");
+                               return -EINVAL;
+                       }
                        if (ctx_field_size <= 4) {
                                if (shift)
                                        insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
@@ -12388,7 +12392,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
                subprog_end = env->subprog_info[i + 1].start;
 
                len = subprog_end - subprog_start;
-               /* BPF_PROG_RUN doesn't call subprogs directly,
+               /* bpf_prog_run() doesn't call subprogs directly,
                 * hence main prog stats include the runtime of subprogs.
                 * subprogs don't have IDs and are not reachable via prog_get_next_id
                 * func[i]->stats will never be accessed and stays NULL
index 1cb1f9b..011cc50 100644 (file)
@@ -4697,7 +4697,6 @@ errout:
 }
 
 static void perf_event_free_filter(struct perf_event *event);
-static void perf_event_free_bpf_prog(struct perf_event *event);
 
 static void free_event_rcu(struct rcu_head *head)
 {
@@ -5574,7 +5573,6 @@ static inline int perf_fget_light(int fd, struct fd *p)
 static int perf_event_set_output(struct perf_event *event,
                                 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
 static int perf_copy_attr(struct perf_event_attr __user *uattr,
                          struct perf_event_attr *attr);
 
@@ -5637,7 +5635,22 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
                return perf_event_set_filter(event, (void __user *)arg);
 
        case PERF_EVENT_IOC_SET_BPF:
-               return perf_event_set_bpf_prog(event, arg);
+       {
+               struct bpf_prog *prog;
+               int err;
+
+               prog = bpf_prog_get(arg);
+               if (IS_ERR(prog))
+                       return PTR_ERR(prog);
+
+               err = perf_event_set_bpf_prog(event, prog, 0);
+               if (err) {
+                       bpf_prog_put(prog);
+                       return err;
+               }
+
+               return 0;
+       }
 
        case PERF_EVENT_IOC_PAUSE_OUTPUT: {
                struct perf_buffer *rb;
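
The legacy ioctl path above now resolves the prog itself and always
passes a zero cookie; existing userspace remains unchanged, e.g.:

#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* perf_fd from perf_event_open(), prog_fd from program load */
static int attach_legacy(int perf_fd, int prog_fd)
{
	return ioctl(perf_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
}
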
@@ -9907,13 +9920,16 @@ static void bpf_overflow_handler(struct perf_event *event,
                .data = data,
                .event = event,
        };
+       struct bpf_prog *prog;
        int ret = 0;
 
        ctx.regs = perf_arch_bpf_user_pt_regs(regs);
        if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
                goto out;
        rcu_read_lock();
-       ret = BPF_PROG_RUN(event->prog, &ctx);
+       prog = READ_ONCE(event->prog);
+       if (prog)
+               ret = bpf_prog_run(prog, &ctx);
        rcu_read_unlock();
 out:
        __this_cpu_dec(bpf_prog_active);
@@ -9923,10 +9939,10 @@ out:
        event->orig_overflow_handler(event, data, regs);
 }
 
-static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+static int perf_event_set_bpf_handler(struct perf_event *event,
+                                     struct bpf_prog *prog,
+                                     u64 bpf_cookie)
 {
-       struct bpf_prog *prog;
-
        if (event->overflow_handler_context)
                /* hw breakpoint or kernel counter */
                return -EINVAL;
@@ -9934,9 +9950,8 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
        if (event->prog)
                return -EEXIST;
 
-       prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
-       if (IS_ERR(prog))
-               return PTR_ERR(prog);
+       if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
+               return -EINVAL;
 
        if (event->attr.precise_ip &&
            prog->call_get_stack &&
@@ -9952,11 +9967,11 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
                 * attached to perf_sample_data, do not allow attaching BPF
                 * program that calls bpf_get_[stack|stackid].
                 */
-               bpf_prog_put(prog);
                return -EPROTO;
        }
 
        event->prog = prog;
+       event->bpf_cookie = bpf_cookie;
        event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
        WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
        return 0;
@@ -9974,7 +9989,9 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
        bpf_prog_put(prog);
 }
 #else
-static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+static int perf_event_set_bpf_handler(struct perf_event *event,
+                                     struct bpf_prog *prog,
+                                     u64 bpf_cookie)
 {
        return -EOPNOTSUPP;
 }
@@ -10002,14 +10019,13 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
        return false;
 }
 
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+                           u64 bpf_cookie)
 {
        bool is_kprobe, is_tracepoint, is_syscall_tp;
-       struct bpf_prog *prog;
-       int ret;
 
        if (!perf_event_is_tracing(event))
-               return perf_event_set_bpf_handler(event, prog_fd);
+               return perf_event_set_bpf_handler(event, prog, bpf_cookie);
 
        is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
        is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
@@ -10018,41 +10034,27 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
                /* bpf programs can only be attached to u/kprobe or tracepoint */
                return -EINVAL;
 
-       prog = bpf_prog_get(prog_fd);
-       if (IS_ERR(prog))
-               return PTR_ERR(prog);
-
        if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
            (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
-           (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
-               /* valid fd, but invalid bpf program type */
-               bpf_prog_put(prog);
+           (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
                return -EINVAL;
-       }
 
        /* Kprobe override only works for kprobes, not uprobes. */
        if (prog->kprobe_override &&
-           !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
-               bpf_prog_put(prog);
+           !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
                return -EINVAL;
-       }
 
        if (is_tracepoint || is_syscall_tp) {
                int off = trace_event_get_offsets(event->tp_event);
 
-               if (prog->aux->max_ctx_offset > off) {
-                       bpf_prog_put(prog);
+               if (prog->aux->max_ctx_offset > off)
                        return -EACCES;
-               }
        }
 
-       ret = perf_event_attach_bpf_prog(event, prog);
-       if (ret)
-               bpf_prog_put(prog);
-       return ret;
+       return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
 }
 
-static void perf_event_free_bpf_prog(struct perf_event *event)
+void perf_event_free_bpf_prog(struct perf_event *event)
 {
        if (!perf_event_is_tracing(event)) {
                perf_event_free_bpf_handler(event);
@@ -10071,12 +10073,13 @@ static void perf_event_free_filter(struct perf_event *event)
 {
 }
 
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+                           u64 bpf_cookie)
 {
        return -ENOENT;
 }
 
-static void perf_event_free_bpf_prog(struct perf_event *event)
+void perf_event_free_bpf_prog(struct perf_event *event)
 {
 }
 #endif /* CONFIG_EVENT_TRACING */
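
The plumbing above carries an opaque u64 from attach time into
event->bpf_cookie. User space supplies it through libbpf's *_opts attach
variants; a minimal sketch, assuming a libbpf that ships struct
bpf_kprobe_opts with a .bpf_cookie field and
bpf_program__attach_kprobe_opts() (both added alongside this kernel
change; treat the exact names as assumptions):

#include <linux/types.h>
#include <bpf/libbpf.h>

/* Attach the same kprobe program twice, telling the attachments apart
 * by cookie instead of loading two program copies.
 */
static struct bpf_link *attach_with_cookie(struct bpf_program *prog,
                                           const char *func, __u64 cookie)
{
        DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts, .bpf_cookie = cookie);

        return bpf_program__attach_kprobe_opts(prog, func, &opts);
}
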
index 0da94e1..8e2eb95 100644 (file)
@@ -124,7 +124,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
         * out of events when it was updated in between this and the
         * rcu_dereference() which is accepted risk.
         */
-       ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
+       ret = BPF_PROG_RUN_ARRAY(call->prog_array, ctx, bpf_prog_run);
 
  out:
        __this_cpu_dec(bpf_prog_active);
@@ -714,13 +714,28 @@ BPF_CALL_0(bpf_get_current_task_btf)
        return (unsigned long) current;
 }
 
-BTF_ID_LIST_SINGLE(bpf_get_current_btf_ids, struct, task_struct)
-
-static const struct bpf_func_proto bpf_get_current_task_btf_proto = {
+const struct bpf_func_proto bpf_get_current_task_btf_proto = {
        .func           = bpf_get_current_task_btf,
        .gpl_only       = true,
        .ret_type       = RET_PTR_TO_BTF_ID,
-       .ret_btf_id     = &bpf_get_current_btf_ids[0],
+       .ret_btf_id     = &btf_task_struct_ids[0],
+};
+
+BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task)
+{
+       return (unsigned long) task_pt_regs(task);
+}
+
+BTF_ID_LIST(bpf_task_pt_regs_ids)
+BTF_ID(struct, pt_regs)
+
+const struct bpf_func_proto bpf_task_pt_regs_proto = {
+       .func           = bpf_task_pt_regs,
+       .gpl_only       = true,
+       .arg1_type      = ARG_PTR_TO_BTF_ID,
+       .arg1_btf_id    = &btf_task_struct_ids[0],
+       .ret_type       = RET_PTR_TO_BTF_ID,
+       .ret_btf_id     = &bpf_task_pt_regs_ids[0],
 };
 
 BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
@@ -975,7 +990,34 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
        .arg1_type      = ARG_PTR_TO_CTX,
 };
 
-const struct bpf_func_proto *
+BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx)
+{
+       struct bpf_trace_run_ctx *run_ctx;
+
+       run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
+       return run_ctx->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_trace = {
+       .func           = bpf_get_attach_cookie_trace,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie_pe, struct bpf_perf_event_data_kern *, ctx)
+{
+       return ctx->event->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = {
+       .func           = bpf_get_attach_cookie_pe,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+};
+
+static const struct bpf_func_proto *
 bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
        switch (func_id) {
@@ -1005,6 +1047,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &bpf_get_current_task_proto;
        case BPF_FUNC_get_current_task_btf:
                return &bpf_get_current_task_btf_proto;
+       case BPF_FUNC_task_pt_regs:
+               return &bpf_task_pt_regs_proto;
        case BPF_FUNC_get_current_uid_gid:
                return &bpf_get_current_uid_gid_proto;
        case BPF_FUNC_get_current_comm:
@@ -1109,6 +1153,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #endif
        case BPF_FUNC_get_func_ip:
                return &bpf_get_func_ip_proto_kprobe;
+       case BPF_FUNC_get_attach_cookie:
+               return &bpf_get_attach_cookie_proto_trace;
        default:
                return bpf_tracing_func_proto(func_id, prog);
        }
@@ -1219,6 +1265,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &bpf_get_stackid_proto_tp;
        case BPF_FUNC_get_stack:
                return &bpf_get_stack_proto_tp;
+       case BPF_FUNC_get_attach_cookie:
+               return &bpf_get_attach_cookie_proto_trace;
        default:
                return bpf_tracing_func_proto(func_id, prog);
        }
@@ -1326,6 +1374,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &bpf_perf_prog_read_value_proto;
        case BPF_FUNC_read_branch_records:
                return &bpf_read_branch_records_proto;
+       case BPF_FUNC_get_attach_cookie:
+               return &bpf_get_attach_cookie_proto_pe;
        default:
                return bpf_tracing_func_proto(func_id, prog);
        }
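
On the BPF side, the cookie comes back through the helper wired up just
above. A hedged sketch of a kprobe program (probe target and output are
illustrative only):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

SEC("kprobe/do_sys_openat2")
int trace_open(struct pt_regs *ctx)
{
        /* Returns the u64 set at attach time, or 0 if none was given. */
        __u64 cookie = bpf_get_attach_cookie(ctx);

        bpf_printk("openat2 hit, cookie %llu", cookie);
        return 0;
}
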
@@ -1675,7 +1725,8 @@ static DEFINE_MUTEX(bpf_event_mutex);
 #define BPF_TRACE_MAX_PROGS 64
 
 int perf_event_attach_bpf_prog(struct perf_event *event,
-                              struct bpf_prog *prog)
+                              struct bpf_prog *prog,
+                              u64 bpf_cookie)
 {
        struct bpf_prog_array *old_array;
        struct bpf_prog_array *new_array;
@@ -1702,12 +1753,13 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
                goto unlock;
        }
 
-       ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+       ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array);
        if (ret < 0)
                goto unlock;
 
        /* set the new array to event->tp_event and set event->prog */
        event->prog = prog;
+       event->bpf_cookie = bpf_cookie;
        rcu_assign_pointer(event->tp_event->prog_array, new_array);
        bpf_prog_array_free(old_array);
 
@@ -1728,7 +1780,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
                goto unlock;
 
        old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
-       ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+       ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array);
        if (ret == -ENOENT)
                goto unlock;
        if (ret < 0) {
@@ -1816,7 +1868,7 @@ void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
 {
        cant_sleep();
        rcu_read_lock();
-       (void) BPF_PROG_RUN(prog, args);
+       (void) bpf_prog_run(prog, args);
        rcu_read_unlock();
 }
 
index 44d8197..830a18e 100644 (file)
@@ -5163,7 +5163,7 @@ static struct bpf_test tests[] = {
                { { 0, -1 } }
        },
        {
-               "ALU64_ARSH_K: Zero shoft",
+               "ALU64_ARSH_K: Zero shift",
                .u.insns_int = {
                        BPF_LD_IMM64(R0, 0x8123456789abcdefLL),
                        BPF_ALU64_IMM(BPF_ARSH, R0, 0),
@@ -8616,7 +8616,7 @@ static int __run_one(const struct bpf_prog *fp, const void *data,
        start = ktime_get_ns();
 
        for (i = 0; i < runs; i++)
-               ret = BPF_PROG_RUN(fp, data);
+               ret = bpf_prog_run(fp, data);
 
        finish = ktime_get_ns();
        migrate_enable();
index 4b855af..2eb0e55 100644 (file)
@@ -116,7 +116,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
                if (xdp)
                        *retval = bpf_prog_run_xdp(prog, ctx);
                else
-                       *retval = BPF_PROG_RUN(prog, ctx);
+                       *retval = bpf_prog_run(prog, ctx);
        } while (bpf_test_timer_continue(&t, repeat, &ret, time));
        bpf_reset_run_ctx(old_ctx);
        bpf_test_timer_leave(&t);
@@ -327,7 +327,7 @@ __bpf_prog_test_run_raw_tp(void *data)
        struct bpf_raw_tp_test_run_info *info = data;
 
        rcu_read_lock();
-       info->retval = BPF_PROG_RUN(info->prog, info->ctx);
+       info->retval = bpf_prog_run(info->prog, info->ctx);
        rcu_read_unlock();
 }
 
@@ -989,7 +989,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat
        bpf_test_timer_enter(&t);
        do {
                ctx.selected_sk = NULL;
-               retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN);
+               retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run);
        } while (bpf_test_timer_continue(&t, repeat, &ret, &duration));
        bpf_test_timer_leave(&t);
 
index 3aca07c..2e32cee 100644 (file)
@@ -114,7 +114,7 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
  * Run the eBPF program and then cut skb->data to correct size returned by
  * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
  * than pkt_len we keep whole skb->data. This is the socket level
- * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
+ * wrapper to bpf_prog_run. It returns 0 if the packet should
  * be accepted or -EPERM if the packet should be tossed.
  *
  */
@@ -4676,6 +4676,30 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
        .arg1_type      = ARG_PTR_TO_CTX_OR_NULL,
 };
 
+BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
+{
+       return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = {
+       .func           = bpf_get_netns_cookie_sock_ops,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx)
+{
+       return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = {
+       .func           = bpf_get_netns_cookie_sk_msg,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX_OR_NULL,
+};
+
 BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
 {
        struct sock *sk = sk_to_full_sk(skb->sk);
@@ -5027,6 +5051,12 @@ err_clear:
 BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
           int, optname, char *, optval, int, optlen)
 {
+       if (level == SOL_TCP && optname == TCP_CONGESTION) {
+               if (optlen >= sizeof("cdg") - 1 &&
+                   !strncmp("cdg", optval, optlen))
+                       return -ENOTSUPP;
+       }
+
        return _bpf_setsockopt(sk, level, optname, optval, optlen);
 }
 
@@ -7491,6 +7521,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
+       case BPF_FUNC_get_netns_cookie:
+               return &bpf_get_netns_cookie_sock_ops_proto;
 #ifdef CONFIG_INET
        case BPF_FUNC_load_hdr_opt:
                return &bpf_sock_ops_load_hdr_opt_proto;
@@ -7537,6 +7569,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
+       case BPF_FUNC_get_netns_cookie:
+               return &bpf_get_netns_cookie_sk_msg_proto;
 #ifdef CONFIG_CGROUPS
        case BPF_FUNC_get_current_cgroup_id:
                return &bpf_get_current_cgroup_id_proto;
@@ -10115,7 +10149,7 @@ struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
        enum sk_action action;
 
        bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
-       action = BPF_PROG_RUN(prog, &reuse_kern);
+       action = bpf_prog_run(prog, &reuse_kern);
 
        if (action == SK_PASS)
                return reuse_kern.selected_sk;
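
The two proto entries above make the existing bpf_get_netns_cookie()
helper callable from sockops and sk_msg programs. A minimal sockops
sketch:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

SEC("sockops")
int sockops_netns(struct bpf_sock_ops *skops)
{
        /* Stable identifier of the socket's network namespace; the
         * argument is ARG_PTR_TO_CTX_OR_NULL, as declared above.
         */
        __u64 cookie = bpf_get_netns_cookie(skops);

        bpf_printk("netns cookie %llu", cookie);
        return 1;
}
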
index e33fde0..dd4cf01 100644 (file)
@@ -103,7 +103,7 @@ static struct bpf_prog *ptp_insns __read_mostly;
 
 unsigned int ptp_classify_raw(const struct sk_buff *skb)
 {
-       return BPF_PROG_RUN(ptp_insns, skb);
+       return bpf_prog_run(ptp_insns, skb);
 }
 EXPORT_SYMBOL_GPL(ptp_classify_raw);
 
index ae5fa43..e252b8e 100644 (file)
@@ -1494,6 +1494,7 @@ void sock_map_unhash(struct sock *sk)
        rcu_read_unlock();
        saved_unhash(sk);
 }
+EXPORT_SYMBOL_GPL(sock_map_unhash);
 
 void sock_map_close(struct sock *sk, long timeout)
 {
index 0e4d758..1d816a5 100644 (file)
@@ -452,7 +452,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
         * changes context in a wrong way it will be caught.
         */
        err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
-                                                BPF_CGROUP_INET4_BIND, &flags);
+                                                CGROUP_INET4_BIND, &flags);
        if (err)
                return err;
 
@@ -781,7 +781,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
                sin->sin_port = inet->inet_dport;
                sin->sin_addr.s_addr = inet->inet_daddr;
                BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-                                           BPF_CGROUP_INET4_GETPEERNAME,
+                                           CGROUP_INET4_GETPEERNAME,
                                            NULL);
        } else {
                __be32 addr = inet->inet_rcv_saddr;
@@ -790,7 +790,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
                sin->sin_port = inet->inet_sport;
                sin->sin_addr.s_addr = addr;
                BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-                                           BPF_CGROUP_INET4_GETSOCKNAME,
+                                           CGROUP_INET4_GETSOCKNAME,
                                            NULL);
        }
        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
index 9e41eff..0dcee9d 100644 (file)
@@ -10,6 +10,9 @@
 #include <net/tcp.h>
 #include <net/bpf_sk_storage.h>
 
+/* "extern" is to avoid sparse warning.  It is only used in bpf_struct_ops.c. */
+extern struct bpf_struct_ops bpf_tcp_congestion_ops;
+
 static u32 optional_ops[] = {
        offsetof(struct tcp_congestion_ops, init),
        offsetof(struct tcp_congestion_ops, release),
@@ -163,6 +166,19 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = {
        .arg2_type      = ARG_ANYTHING,
 };
 
+static u32 prog_ops_moff(const struct bpf_prog *prog)
+{
+       const struct btf_member *m;
+       const struct btf_type *t;
+       u32 midx;
+
+       midx = prog->expected_attach_type;
+       t = bpf_tcp_congestion_ops.type;
+       m = &btf_type_member(t)[midx];
+
+       return btf_member_bit_offset(t, m) / 8;
+}
+
 static const struct bpf_func_proto *
 bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
                          const struct bpf_prog *prog)
@@ -174,6 +190,28 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
+       case BPF_FUNC_setsockopt:
+               /* Do not allow release() to call setsockopt().
+                * release() is called when the current bpf-tcp-cc
+                * is retiring; it must not call setsockopt() to
+                * make further changes, which could allocate new
+                * resources.
+                */
+               if (prog_ops_moff(prog) !=
+                   offsetof(struct tcp_congestion_ops, release))
+                       return &bpf_sk_setsockopt_proto;
+               return NULL;
+       case BPF_FUNC_getsockopt:
+               /* Since getsockopt() and setsockopt() are usually
+                * expected to be available together, also disable
+                * getsockopt() for release() to avoid usage
+                * surprises.  A bpf-tcp-cc already has a more
+                * powerful way to read tcp_sock through its
+                * PTR_TO_BTF_ID argument.
+                */
+               if (prog_ops_moff(prog) !=
+                   offsetof(struct tcp_congestion_ops, release))
+                       return &bpf_sk_getsockopt_proto;
+               return NULL;
        default:
                return bpf_base_func_proto(func_id);
        }
@@ -286,9 +324,6 @@ static void bpf_tcp_ca_unreg(void *kdata)
        tcp_unregister_congestion_control(kdata);
 }
 
-/* Avoid sparse warning.  It is only used in bpf_struct_ops.c. */
-extern struct bpf_struct_ops bpf_tcp_congestion_ops;
-
 struct bpf_struct_ops bpf_tcp_congestion_ops = {
        .verifier_ops = &bpf_tcp_ca_verifier_ops,
        .reg = bpf_tcp_ca_reg,
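
This is what allows a struct_ops congestion control to switch the socket
to another algorithm from init() (the dctcp fallback selftest in this
series does exactly that). A hedged sketch; the struct_ops map wiring is
omitted, and SOL_TCP/TCP_CONGESTION are defined locally because
vmlinux.h carries no UAPI constants:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define SOL_TCP         6
#define TCP_CONGESTION  13

char LICENSE[] SEC("license") = "GPL";

static char fallback_cc[] = "cubic";

SEC("struct_ops/sample_init")
void BPF_PROG(sample_init, struct sock *sk)
{
        /* Permitted from every op except release(), per the check above. */
        bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
                       fallback_cc, sizeof(fallback_cc));
}
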
index 1a742b7..8851c94 100644 (file)
@@ -1143,7 +1143,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
                rcu_read_unlock();
        }
 
-       if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) {
+       if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) {
                err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
                                            (struct sockaddr *)usin, &ipc.addr);
                if (err)
index d92c90d..b5878bb 100644 (file)
@@ -455,7 +455,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
         * changes context in a wrong way it will be caught.
         */
        err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
-                                                BPF_CGROUP_INET6_BIND, &flags);
+                                                CGROUP_INET6_BIND, &flags);
        if (err)
                return err;
 
@@ -532,7 +532,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
                if (np->sndflow)
                        sin->sin6_flowinfo = np->flow_label;
                BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-                                           BPF_CGROUP_INET6_GETPEERNAME,
+                                           CGROUP_INET6_GETPEERNAME,
                                            NULL);
        } else {
                if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
@@ -541,7 +541,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
                        sin->sin6_addr = sk->sk_v6_rcv_saddr;
                sin->sin6_port = inet->inet_sport;
                BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-                                           BPF_CGROUP_INET6_GETSOCKNAME,
+                                           CGROUP_INET6_GETSOCKNAME,
                                            NULL);
        }
        sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
index c5e15e9..ea53847 100644 (file)
@@ -1475,7 +1475,7 @@ do_udp_sendmsg:
                fl6.saddr = np->saddr;
        fl6.fl6_sport = inet->inet_sport;
 
-       if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) {
+       if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
                err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
                                           (struct sockaddr *)sin6, &fl6.saddr);
                if (err)
index 13cf3f9..849ac55 100644 (file)
@@ -90,7 +90,7 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
        const struct xt_bpf_info *info = par->matchinfo;
 
-       return BPF_PROG_RUN(info->filter, skb);
+       return bpf_prog_run(info->filter, skb);
 }
 
 static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
index 040807a..5c36013 100644 (file)
@@ -47,11 +47,11 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
        if (at_ingress) {
                __skb_push(skb, skb->mac_len);
                bpf_compute_data_pointers(skb);
-               filter_res = BPF_PROG_RUN(filter, skb);
+               filter_res = bpf_prog_run(filter, skb);
                __skb_pull(skb, skb->mac_len);
        } else {
                bpf_compute_data_pointers(skb);
-               filter_res = BPF_PROG_RUN(filter, skb);
+               filter_res = bpf_prog_run(filter, skb);
        }
        if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK)
                skb_orphan(skb);
index 3b472ba..df19a84 100644 (file)
@@ -96,11 +96,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
                        /* It is safe to push/pull even if skb_shared() */
                        __skb_push(skb, skb->mac_len);
                        bpf_compute_data_pointers(skb);
-                       filter_res = BPF_PROG_RUN(prog->filter, skb);
+                       filter_res = bpf_prog_run(prog->filter, skb);
                        __skb_pull(skb, skb->mac_len);
                } else {
                        bpf_compute_data_pointers(skb);
-                       filter_res = BPF_PROG_RUN(prog->filter, skb);
+                       filter_res = bpf_prog_run(prog->filter, skb);
                }
 
                if (prog->exts_integrated) {
index 4cf0b1c..7cad52b 100644 (file)
 #include <linux/security.h>
 #include <linux/freezer.h>
 #include <linux/file.h>
+#include <linux/btf_ids.h>
 
 #include "scm.h"
 
@@ -678,6 +679,8 @@ static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
 static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
                          sk_read_actor_t recv_actor);
+static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
+                                sk_read_actor_t recv_actor);
 static int unix_dgram_connect(struct socket *, struct sockaddr *,
                              int, int);
 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
@@ -731,6 +734,7 @@ static const struct proto_ops unix_stream_ops = {
        .shutdown =     unix_shutdown,
        .sendmsg =      unix_stream_sendmsg,
        .recvmsg =      unix_stream_recvmsg,
+       .read_sock =    unix_stream_read_sock,
        .mmap =         sock_no_mmap,
        .sendpage =     unix_stream_sendpage,
        .splice_read =  unix_stream_splice_read,
@@ -794,17 +798,35 @@ static void unix_close(struct sock *sk, long timeout)
         */
 }
 
-struct proto unix_proto = {
-       .name                   = "UNIX",
+static void unix_unhash(struct sock *sk)
+{
+       /* Nothing to do here; a unix socket does not need ->unhash().
+        * This stub exists only for sockmap.
+        */
+}
+
+struct proto unix_dgram_proto = {
+       .name                   = "UNIX-DGRAM",
        .owner                  = THIS_MODULE,
        .obj_size               = sizeof(struct unix_sock),
        .close                  = unix_close,
 #ifdef CONFIG_BPF_SYSCALL
-       .psock_update_sk_prot   = unix_bpf_update_proto,
+       .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
 #endif
 };
 
-static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
+struct proto unix_stream_proto = {
+       .name                   = "UNIX-STREAM",
+       .owner                  = THIS_MODULE,
+       .obj_size               = sizeof(struct unix_sock),
+       .close                  = unix_close,
+       .unhash                 = unix_unhash,
+#ifdef CONFIG_BPF_SYSCALL
+       .psock_update_sk_prot   = unix_stream_bpf_update_proto,
+#endif
+};
+
+static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
 {
        struct sock *sk = NULL;
        struct unix_sock *u;
@@ -813,7 +835,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
        if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
                goto out;
 
-       sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
+       if (type == SOCK_STREAM)
+               sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
+       else /* dgram and seqpacket */
+               sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
+
        if (!sk)
                goto out;
 
@@ -875,7 +901,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol,
                return -ESOCKTNOSUPPORT;
        }
 
-       return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
+       return unix_create1(net, sock, kern, sock->type) ? 0 : -ENOMEM;
 }
 
 static int unix_release(struct socket *sock)
@@ -1289,7 +1315,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
        err = -ENOMEM;
 
        /* create new sock for complete connection */
-       newsk = unix_create1(sock_net(sk), NULL, 0);
+       newsk = unix_create1(sock_net(sk), NULL, 0, sock->type);
        if (newsk == NULL)
                goto out;
 
@@ -2326,8 +2352,10 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si
        struct sock *sk = sock->sk;
 
 #ifdef CONFIG_BPF_SYSCALL
-       if (sk->sk_prot != &unix_proto)
-               return sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
+       const struct proto *prot = READ_ONCE(sk->sk_prot);
+
+       if (prot != &unix_dgram_proto)
+               return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
                                            flags & ~MSG_DONTWAIT, NULL);
 #endif
        return __unix_dgram_recvmsg(sk, msg, size, flags);
@@ -2497,6 +2525,15 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
 }
 #endif
 
+static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
+                                sk_read_actor_t recv_actor)
+{
+       if (unlikely(sk->sk_state != TCP_ESTABLISHED))
+               return -ENOTCONN;
+
+       return unix_read_sock(sk, desc, recv_actor);
+}
+
 static int unix_stream_read_generic(struct unix_stream_read_state *state,
                                    bool freezable)
 {
@@ -2722,6 +2759,20 @@ static int unix_stream_read_actor(struct sk_buff *skb,
        return ret ?: chunk;
 }
 
+int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
+                         size_t size, int flags)
+{
+       struct unix_stream_read_state state = {
+               .recv_actor = unix_stream_read_actor,
+               .socket = sk->sk_socket,
+               .msg = msg,
+               .size = size,
+               .flags = flags
+       };
+
+       return unix_stream_read_generic(&state, true);
+}
+
 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
                               size_t size, int flags)
 {
@@ -2733,6 +2784,14 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
                .flags = flags
        };
 
+#ifdef CONFIG_BPF_SYSCALL
+       struct sock *sk = sock->sk;
+       const struct proto *prot = READ_ONCE(sk->sk_prot);
+
+       if (prot != &unix_stream_proto)
+               return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
+                                           flags & ~MSG_DONTWAIT, NULL);
+#endif
        return unix_stream_read_generic(&state, true);
 }
 
@@ -2793,7 +2852,10 @@ static int unix_shutdown(struct socket *sock, int mode)
                (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
 
                int peer_mode = 0;
+               const struct proto *prot = READ_ONCE(other->sk_prot);
 
+               if (prot->unhash)
+                       prot->unhash(other);
                if (mode&RCV_SHUTDOWN)
                        peer_mode |= SEND_SHUTDOWN;
                if (mode&SEND_SHUTDOWN)
@@ -2802,10 +2864,12 @@ static int unix_shutdown(struct socket *sock, int mode)
                other->sk_shutdown |= peer_mode;
                unix_state_unlock(other);
                other->sk_state_change(other);
-               if (peer_mode == SHUTDOWN_MASK)
+               if (peer_mode == SHUTDOWN_MASK) {
                        sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
-               else if (peer_mode & RCV_SHUTDOWN)
+                       other->sk_state = TCP_CLOSE;
+               } else if (peer_mode & RCV_SHUTDOWN) {
                        sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
+               }
        }
        if (other)
                sock_put(other);
@@ -3150,6 +3214,64 @@ static const struct seq_operations unix_seq_ops = {
        .stop   = unix_seq_stop,
        .show   = unix_seq_show,
 };
+
+#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
+struct bpf_iter__unix {
+       __bpf_md_ptr(struct bpf_iter_meta *, meta);
+       __bpf_md_ptr(struct unix_sock *, unix_sk);
+       uid_t uid __aligned(8);
+};
+
+static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
+                             struct unix_sock *unix_sk, uid_t uid)
+{
+       struct bpf_iter__unix ctx;
+
+       meta->seq_num--;  /* skip SEQ_START_TOKEN */
+       ctx.meta = meta;
+       ctx.unix_sk = unix_sk;
+       ctx.uid = uid;
+       return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
+{
+       struct bpf_iter_meta meta;
+       struct bpf_prog *prog;
+       struct sock *sk = v;
+       uid_t uid;
+
+       if (v == SEQ_START_TOKEN)
+               return 0;
+
+       uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
+       meta.seq = seq;
+       prog = bpf_iter_get_info(&meta, false);
+       return unix_prog_seq_show(prog, &meta, v, uid);
+}
+
+static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
+{
+       struct bpf_iter_meta meta;
+       struct bpf_prog *prog;
+
+       if (!v) {
+               meta.seq = seq;
+               prog = bpf_iter_get_info(&meta, true);
+               if (prog)
+                       (void)unix_prog_seq_show(prog, &meta, v, 0);
+       }
+
+       unix_seq_stop(seq, v);
+}
+
+static const struct seq_operations bpf_iter_unix_seq_ops = {
+       .start  = unix_seq_start,
+       .next   = unix_seq_next,
+       .stop   = bpf_iter_unix_seq_stop,
+       .show   = bpf_iter_unix_seq_show,
+};
+#endif
 #endif
 
 static const struct net_proto_family unix_family_ops = {
@@ -3190,13 +3312,48 @@ static struct pernet_operations unix_net_ops = {
        .exit = unix_net_exit,
 };
 
+#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
+                    struct unix_sock *unix_sk, uid_t uid)
+
+static const struct bpf_iter_seq_info unix_seq_info = {
+       .seq_ops                = &bpf_iter_unix_seq_ops,
+       .init_seq_private       = bpf_iter_init_seq_net,
+       .fini_seq_private       = bpf_iter_fini_seq_net,
+       .seq_priv_size          = sizeof(struct seq_net_private),
+};
+
+static struct bpf_iter_reg unix_reg_info = {
+       .target                 = "unix",
+       .ctx_arg_info_size      = 1,
+       .ctx_arg_info           = {
+               { offsetof(struct bpf_iter__unix, unix_sk),
+                 PTR_TO_BTF_ID_OR_NULL },
+       },
+       .seq_info               = &unix_seq_info,
+};
+
+static void __init bpf_iter_register(void)
+{
+       unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
+       if (bpf_iter_reg_target(&unix_reg_info))
+               pr_warn("Warning: could not register bpf iterator unix\n");
+}
+#endif
+
 static int __init af_unix_init(void)
 {
        int rc = -1;
 
        BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
 
-       rc = proto_register(&unix_proto, 1);
+       rc = proto_register(&unix_dgram_proto, 1);
+       if (rc != 0) {
+               pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
+               goto out;
+       }
+
+       rc = proto_register(&unix_stream_proto, 1);
        if (rc != 0) {
                pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
                goto out;
@@ -3205,6 +3362,11 @@ static int __init af_unix_init(void)
        sock_register(&unix_family_ops);
        register_pernet_subsys(&unix_net_ops);
        unix_bpf_build_proto();
+
+#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+       bpf_iter_register();
+#endif
+
 out:
        return rc;
 }
@@ -3212,7 +3374,8 @@ out:
 static void __exit af_unix_exit(void)
 {
        sock_unregister(PF_UNIX);
-       proto_unregister(&unix_proto);
+       proto_unregister(&unix_dgram_proto);
+       proto_unregister(&unix_stream_proto);
        unregister_pernet_subsys(&unix_net_ops);
 }
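
With the "unix" iterator target registered above, a BPF program can walk
every unix socket in the system. A minimal sketch of the BPF side
(struct bpf_iter__unix matches the context definition above and lands in
vmlinux.h once a kernel carries this change):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("iter/unix")
int dump_unix(struct bpf_iter__unix *ctx)
{
        struct unix_sock *unix_sk = ctx->unix_sk;

        if (!unix_sk)   /* NULL on the final (stop) invocation */
                return 0;

        BPF_SEQ_PRINTF(ctx->meta->seq, "uid=%u\n", ctx->uid);
        return 0;
}
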
 
index 20f5357..b927e2b 100644 (file)
@@ -38,9 +38,18 @@ static int unix_msg_wait_data(struct sock *sk, struct sk_psock *psock,
        return ret;
 }
 
-static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
-                                 size_t len, int nonblock, int flags,
-                                 int *addr_len)
+static int __unix_recvmsg(struct sock *sk, struct msghdr *msg,
+                         size_t len, int flags)
+{
+       if (sk->sk_type == SOCK_DGRAM)
+               return __unix_dgram_recvmsg(sk, msg, len, flags);
+       else
+               return __unix_stream_recvmsg(sk, msg, len, flags);
+}
+
+static int unix_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
+                           size_t len, int nonblock, int flags,
+                           int *addr_len)
 {
        struct unix_sock *u = unix_sk(sk);
        struct sk_psock *psock;
@@ -48,14 +57,14 @@ static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
 
        psock = sk_psock_get(sk);
        if (unlikely(!psock))
-               return __unix_dgram_recvmsg(sk, msg, len, flags);
+               return __unix_recvmsg(sk, msg, len, flags);
 
        mutex_lock(&u->iolock);
        if (!skb_queue_empty(&sk->sk_receive_queue) &&
            sk_psock_queue_empty(psock)) {
                mutex_unlock(&u->iolock);
                sk_psock_put(sk, psock);
-               return __unix_dgram_recvmsg(sk, msg, len, flags);
+               return __unix_recvmsg(sk, msg, len, flags);
        }
 
 msg_bytes_ready:
@@ -71,7 +80,7 @@ msg_bytes_ready:
                                goto msg_bytes_ready;
                        mutex_unlock(&u->iolock);
                        sk_psock_put(sk, psock);
-                       return __unix_dgram_recvmsg(sk, msg, len, flags);
+                       return __unix_recvmsg(sk, msg, len, flags);
                }
                copied = -EAGAIN;
        }
@@ -80,30 +89,55 @@ msg_bytes_ready:
        return copied;
 }
 
-static struct proto *unix_prot_saved __read_mostly;
-static DEFINE_SPINLOCK(unix_prot_lock);
-static struct proto unix_bpf_prot;
+static struct proto *unix_dgram_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(unix_dgram_prot_lock);
+static struct proto unix_dgram_bpf_prot;
+
+static struct proto *unix_stream_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(unix_stream_prot_lock);
+static struct proto unix_stream_bpf_prot;
 
-static void unix_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
+static void unix_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
 {
        *prot        = *base;
        prot->close  = sock_map_close;
-       prot->recvmsg = unix_dgram_bpf_recvmsg;
+       prot->recvmsg = unix_bpf_recvmsg;
+}
+
+static void unix_stream_bpf_rebuild_protos(struct proto *prot,
+                                          const struct proto *base)
+{
+       *prot        = *base;
+       prot->close  = sock_map_close;
+       prot->recvmsg = unix_bpf_recvmsg;
+       prot->unhash  = sock_map_unhash;
+}
+
+static void unix_dgram_bpf_check_needs_rebuild(struct proto *ops)
+{
+       if (unlikely(ops != smp_load_acquire(&unix_dgram_prot_saved))) {
+               spin_lock_bh(&unix_dgram_prot_lock);
+               if (likely(ops != unix_dgram_prot_saved)) {
+                       unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, ops);
+                       smp_store_release(&unix_dgram_prot_saved, ops);
+               }
+               spin_unlock_bh(&unix_dgram_prot_lock);
+       }
 }
 
-static void unix_bpf_check_needs_rebuild(struct proto *ops)
+static void unix_stream_bpf_check_needs_rebuild(struct proto *ops)
 {
-       if (unlikely(ops != smp_load_acquire(&unix_prot_saved))) {
-               spin_lock_bh(&unix_prot_lock);
-               if (likely(ops != unix_prot_saved)) {
-                       unix_bpf_rebuild_protos(&unix_bpf_prot, ops);
-                       smp_store_release(&unix_prot_saved, ops);
+       if (unlikely(ops != smp_load_acquire(&unix_stream_prot_saved))) {
+               spin_lock_bh(&unix_stream_prot_lock);
+               if (likely(ops != unix_stream_prot_saved)) {
+                       unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, ops);
+                       smp_store_release(&unix_stream_prot_saved, ops);
                }
-               spin_unlock_bh(&unix_prot_lock);
+               spin_unlock_bh(&unix_stream_prot_lock);
        }
 }
 
-int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
+int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
 {
        if (sk->sk_type != SOCK_DGRAM)
                return -EOPNOTSUPP;
@@ -114,12 +148,27 @@ int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
                return 0;
        }
 
-       unix_bpf_check_needs_rebuild(psock->sk_proto);
-       WRITE_ONCE(sk->sk_prot, &unix_bpf_prot);
+       unix_dgram_bpf_check_needs_rebuild(psock->sk_proto);
+       WRITE_ONCE(sk->sk_prot, &unix_dgram_bpf_prot);
+       return 0;
+}
+
+int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
+{
+       if (restore) {
+               sk->sk_write_space = psock->saved_write_space;
+               WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+               return 0;
+       }
+
+       unix_stream_bpf_check_needs_rebuild(psock->sk_proto);
+       WRITE_ONCE(sk->sk_prot, &unix_stream_bpf_prot);
        return 0;
 }
 
 void __init unix_bpf_build_proto(void)
 {
-       unix_bpf_rebuild_protos(&unix_bpf_prot, &unix_proto);
+       unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, &unix_dgram_proto);
+       unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, &unix_stream_proto);
 }
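
From user space, established SOCK_STREAM unix sockets can now be added
to a sockmap just like TCP sockets. A hedged sketch, assuming map_fd
refers to an already-created BPF_MAP_TYPE_SOCKMAP:

#include <sys/socket.h>
#include <bpf/bpf.h>

static int add_unix_pair(int map_fd)
{
        int fds[2], key = 0;

        /* A connected pair; only established stream sockets may be
         * inserted.
         */
        if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds))
                return -1;

        return bpf_map_update_elem(map_fd, &key, &fds[0], BPF_ANY);
}
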
index 036998d..4dc20be 100644 (file)
@@ -39,11 +39,6 @@ tprogs-y += lwt_len_hist
 tprogs-y += xdp_tx_iptunnel
 tprogs-y += test_map_in_map
 tprogs-y += per_socket_stats_example
-tprogs-y += xdp_redirect
-tprogs-y += xdp_redirect_map
-tprogs-y += xdp_redirect_map_multi
-tprogs-y += xdp_redirect_cpu
-tprogs-y += xdp_monitor
 tprogs-y += xdp_rxq_info
 tprogs-y += syscall_tp
 tprogs-y += cpustat
@@ -57,11 +52,18 @@ tprogs-y += xdp_sample_pkts
 tprogs-y += ibumad
 tprogs-y += hbm
 
+tprogs-y += xdp_redirect_cpu
+tprogs-y += xdp_redirect_map_multi
+tprogs-y += xdp_redirect_map
+tprogs-y += xdp_redirect
+tprogs-y += xdp_monitor
+
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
 
 CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o
 TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o
+XDP_SAMPLE := xdp_sample_user.o
 
 fds_example-objs := fds_example.o
 sockex1-objs := sockex1_user.o
@@ -98,11 +100,6 @@ lwt_len_hist-objs := lwt_len_hist_user.o
 xdp_tx_iptunnel-objs := xdp_tx_iptunnel_user.o
 test_map_in_map-objs := test_map_in_map_user.o
 per_socket_stats_example-objs := cookie_uid_helper_example.o
-xdp_redirect-objs := xdp_redirect_user.o
-xdp_redirect_map-objs := xdp_redirect_map_user.o
-xdp_redirect_map_multi-objs := xdp_redirect_map_multi_user.o
-xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o
-xdp_monitor-objs := xdp_monitor_user.o
 xdp_rxq_info-objs := xdp_rxq_info_user.o
 syscall_tp-objs := syscall_tp_user.o
 cpustat-objs := cpustat_user.o
@@ -116,6 +113,12 @@ xdp_sample_pkts-objs := xdp_sample_pkts_user.o
 ibumad-objs := ibumad_user.o
 hbm-objs := hbm.o $(CGROUP_HELPERS)
 
+xdp_redirect_map_multi-objs := xdp_redirect_map_multi_user.o $(XDP_SAMPLE)
+xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o $(XDP_SAMPLE)
+xdp_redirect_map-objs := xdp_redirect_map_user.o $(XDP_SAMPLE)
+xdp_redirect-objs := xdp_redirect_user.o $(XDP_SAMPLE)
+xdp_monitor-objs := xdp_monitor_user.o $(XDP_SAMPLE)
+
 # Tell kbuild to always build the programs
 always-y := $(tprogs-y)
 always-y += sockex1_kern.o
@@ -160,11 +163,6 @@ always-y += tcp_clamp_kern.o
 always-y += tcp_basertt_kern.o
 always-y += tcp_tos_reflect_kern.o
 always-y += tcp_dumpstats_kern.o
-always-y += xdp_redirect_kern.o
-always-y += xdp_redirect_map_kern.o
-always-y += xdp_redirect_map_multi_kern.o
-always-y += xdp_redirect_cpu_kern.o
-always-y += xdp_monitor_kern.o
 always-y += xdp_rxq_info_kern.o
 always-y += xdp2skb_meta_kern.o
 always-y += syscall_tp_kern.o
@@ -276,6 +274,11 @@ $(LIBBPF): FORCE
        $(MAKE) -C $(dir $@) RM='rm -rf' EXTRA_CFLAGS="$(TPROGS_CFLAGS)" \
                LDFLAGS=$(TPROGS_LDFLAGS) srctree=$(BPF_SAMPLES_PATH)/../../ O=
 
+BPFTOOLDIR := $(TOOLS_PATH)/bpf/bpftool
+BPFTOOL := $(BPFTOOLDIR)/bpftool
+$(BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile)
+           $(MAKE) -C $(BPFTOOLDIR) srctree=$(BPF_SAMPLES_PATH)/../../
+
 $(obj)/syscall_nrs.h:  $(obj)/syscall_nrs.s FORCE
        $(call filechk,offsets,__SYSCALL_NRS_H__)
 
@@ -306,6 +309,12 @@ verify_target_bpf: verify_cmds
 $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF)
 $(src)/*.c: verify_target_bpf $(LIBBPF)
 
+$(obj)/xdp_redirect_cpu_user.o: $(obj)/xdp_redirect_cpu.skel.h
+$(obj)/xdp_redirect_map_multi_user.o: $(obj)/xdp_redirect_map_multi.skel.h
+$(obj)/xdp_redirect_map_user.o: $(obj)/xdp_redirect_map.skel.h
+$(obj)/xdp_redirect_user.o: $(obj)/xdp_redirect.skel.h
+$(obj)/xdp_monitor_user.o: $(obj)/xdp_monitor.skel.h
+
 $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
 $(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
 $(obj)/hbm.o: $(src)/hbm.h
@@ -313,6 +322,76 @@ $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
 
 -include $(BPF_SAMPLES_PATH)/Makefile.target
 
+VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux)                           \
+                    $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)    \
+                    ../../../../vmlinux                                \
+                    /sys/kernel/btf/vmlinux                            \
+                    /boot/vmlinux-$(shell uname -r)
+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS))))
+
+ifeq ($(VMLINUX_BTF),)
+$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)")
+endif
+
+$(obj)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL)
+ifeq ($(VMLINUX_H),)
+       $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@
+else
+       $(Q)cp "$(VMLINUX_H)" $@
+endif
+
+clean-files += vmlinux.h
+
+# Get Clang's default includes on this system, as opposed to those seen by
+# '-target bpf'. This fixes "missing" files on some architectures/distros,
+# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
+#
+# Use '-idirafter': Don't interfere with include mechanics except where the
+# build would have failed anyways.
+define get_sys_includes
+$(shell $(1) -v -E - </dev/null 2>&1 \
+        | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
+$(shell $(1) -dM -E - </dev/null | grep '#define __riscv_xlen ' | sed 's/#define /-D/' | sed 's/ /=/')
+endef
+
+CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG))
+
+$(obj)/xdp_redirect_cpu.bpf.o: $(obj)/xdp_sample.bpf.o
+$(obj)/xdp_redirect_map_multi.bpf.o: $(obj)/xdp_sample.bpf.o
+$(obj)/xdp_redirect_map.bpf.o: $(obj)/xdp_sample.bpf.o
+$(obj)/xdp_redirect.bpf.o: $(obj)/xdp_sample.bpf.o
+$(obj)/xdp_monitor.bpf.o: $(obj)/xdp_sample.bpf.o
+
+$(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src)/xdp_sample.bpf.h $(src)/xdp_sample_shared.h
+       @echo "  CLANG-BPF " $@
+       $(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(SRCARCH) \
+               -Wno-compare-distinct-pointer-types -I$(srctree)/include \
+               -I$(srctree)/samples/bpf -I$(srctree)/tools/include \
+               -I$(srctree)/tools/lib $(CLANG_SYS_INCLUDES) \
+               -c $(filter %.bpf.c,$^) -o $@
+
+LINKED_SKELS := xdp_redirect_cpu.skel.h xdp_redirect_map_multi.skel.h \
+               xdp_redirect_map.skel.h xdp_redirect.skel.h xdp_monitor.skel.h
+clean-files += $(LINKED_SKELS)
+
+xdp_redirect_cpu.skel.h-deps := xdp_redirect_cpu.bpf.o xdp_sample.bpf.o
+xdp_redirect_map_multi.skel.h-deps := xdp_redirect_map_multi.bpf.o xdp_sample.bpf.o
+xdp_redirect_map.skel.h-deps := xdp_redirect_map.bpf.o xdp_sample.bpf.o
+xdp_redirect.skel.h-deps := xdp_redirect.bpf.o xdp_sample.bpf.o
+xdp_monitor.skel.h-deps := xdp_monitor.bpf.o xdp_sample.bpf.o
+
+LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.bpf.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps)))
+
+BPF_SRCS_LINKED := $(notdir $(wildcard $(src)/*.bpf.c))
+BPF_OBJS_LINKED := $(patsubst %.bpf.c,$(obj)/%.bpf.o, $(BPF_SRCS_LINKED))
+BPF_SKELS_LINKED := $(addprefix $(obj)/,$(LINKED_SKELS))
+
+$(BPF_SKELS_LINKED): $(BPF_OBJS_LINKED) $(BPFTOOL)
+       @echo "  BPF GEN-OBJ " $(@:.skel.h=)
+       $(Q)$(BPFTOOL) gen object $(@:.skel.h=.lbpf.o) $(addprefix $(obj)/,$($(@F)-deps))
+       @echo "  BPF GEN-SKEL" $(@:.skel.h=)
+       $(Q)$(BPFTOOL) gen skeleton $(@:.skel.h=.lbpf.o) name $(notdir $(@:.skel.h=)) > $@
+
 # asm/sysreg.h - inline assembly used by it is incompatible with llvm.
 # But, there is no easy way to fix it, so just exclude it since it is
 # useless for BPF samples.
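
The generated *.skel.h headers are what the revamped _user.c programs
consume; the entry points follow bpftool's skeleton naming convention
(name xdp_monitor, per the gen skeleton rule above). A hedged usage
sketch:

#include "xdp_monitor.skel.h"

int run_monitor(void)
{
        struct xdp_monitor *skel;
        int err;

        skel = xdp_monitor__open();     /* parse the embedded object */
        if (!skel)
                return -1;

        err = xdp_monitor__load(skel);  /* create maps, load programs */
        if (err)
                goto out;

        /* ... attach tracepoints and poll stats via the xdp_sample
         * helpers ...
         */
out:
        xdp_monitor__destroy(skel);
        return err;
}
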
index 7621f55..5a368af 100644 (file)
@@ -73,3 +73,14 @@ quiet_cmd_tprog-cobjs        = CC  $@
       cmd_tprog-cobjs  = $(CC) $(tprogc_flags) -c -o $@ $<
 $(tprog-cobjs): $(obj)/%.o: $(src)/%.c FORCE
        $(call if_changed_dep,tprog-cobjs)
+
+# Override includes for xdp_sample_user.o because $(srctree)/usr/include in
+# TPROGS_CFLAGS causes conflicts
+XDP_SAMPLE_CFLAGS += -Wall -O2 -lm \
+                    -I./tools/include \
+                    -I./tools/include/uapi \
+                    -I./tools/lib \
+                    -I./tools/testing/selftests/bpf
+$(obj)/xdp_sample_user.o: $(src)/xdp_sample_user.c \
+       $(src)/xdp_sample_user.h $(src)/xdp_sample_shared.h
+       $(CC) $(XDP_SAMPLE_CFLAGS) -c -o $@ $<
index cc3bce8..5495880 100644 (file)
@@ -167,7 +167,7 @@ static void prog_load(void)
 static void prog_attach_iptables(char *file)
 {
        int ret;
-       char rules[100];
+       char rules[256];
 
        if (bpf_obj_pin(prog_fd, file))
                error(1, errno, "bpf_obj_pin");
@@ -175,8 +175,13 @@ static void prog_attach_iptables(char *file)
                printf("file path too long: %s\n", file);
                exit(1);
        }
-       sprintf(rules, "iptables -A OUTPUT -m bpf --object-pinned %s -j ACCEPT",
-               file);
+       ret = snprintf(rules, sizeof(rules),
+                      "iptables -A OUTPUT -m bpf --object-pinned %s -j ACCEPT",
+                      file);
+       if (ret < 0 || ret >= sizeof(rules)) {
+               printf("error constructing iptables command\n");
+               exit(1);
+       }
        ret = system(rules);
        if (ret < 0) {
                printf("iptables rule update failed: %d/n", WEXITSTATUS(ret));
index 14b7929..4866afd 100644 (file)
@@ -20,6 +20,7 @@
        })
 
 #define MINBLOCK_US    1
+#define MAX_ENTRIES    10000
 
 struct key_t {
        char waker[TASK_COMM_LEN];
@@ -32,14 +33,14 @@ struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __type(key, struct key_t);
        __type(value, u64);
-       __uint(max_entries, 10000);
+       __uint(max_entries, MAX_ENTRIES);
 } counts SEC(".maps");
 
 struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __type(key, u32);
        __type(value, u64);
-       __uint(max_entries, 10000);
+       __uint(max_entries, MAX_ENTRIES);
 } start SEC(".maps");
 
 struct wokeby_t {
@@ -51,14 +52,14 @@ struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __type(key, u32);
        __type(value, struct wokeby_t);
-       __uint(max_entries, 10000);
+       __uint(max_entries, MAX_ENTRIES);
 } wokeby SEC(".maps");
 
 struct {
        __uint(type, BPF_MAP_TYPE_STACK_TRACE);
        __uint(key_size, sizeof(u32));
        __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64));
-       __uint(max_entries, 10000);
+       __uint(max_entries, MAX_ENTRIES);
 } stackmap SEC(".maps");
 
 #define STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)
index cea3994..566e644 100644 (file)
@@ -32,7 +32,7 @@ static void print_old_objects(int fd)
        __u64 key, next_key;
        struct pair v;
 
-       key = write(1, "\e[1;1H\e[2J", 12); /* clear screen */
+       key = write(1, "\e[1;1H\e[2J", 11); /* clear screen */
 
        key = -1;
        while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
diff --git a/samples/bpf/xdp_monitor.bpf.c b/samples/bpf/xdp_monitor.bpf.c
new file mode 100644 (file)
index 0000000..cfb41e2
--- /dev/null
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+/*  Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc.
+ *
+ * XDP monitor tool, based on tracepoints
+ */
+#include "xdp_sample.bpf.h"
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
deleted file mode 100644 (file)
index 5c955b8..0000000
+++ /dev/null
@@ -1,257 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0
- *  Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc.
- *
- * XDP monitor tool, based on tracepoints
- */
-#include <uapi/linux/bpf.h>
-#include <bpf/bpf_helpers.h>
-
-struct {
-       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
-       __type(key, u32);
-       __type(value, u64);
-       __uint(max_entries, 2);
-       /* TODO: have entries for all possible errno's */
-} redirect_err_cnt SEC(".maps");
-
-#define XDP_UNKNOWN    XDP_REDIRECT + 1
-struct {
-       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
-       __type(key, u32);
-       __type(value, u64);
-       __uint(max_entries, XDP_UNKNOWN + 1);
-} exception_cnt SEC(".maps");
-
-/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
- * Code in:                kernel/include/trace/events/xdp.h
- */
-struct xdp_redirect_ctx {
-       u64 __pad;              // First 8 bytes are not accessible by bpf code
-       int prog_id;            //      offset:8;  size:4; signed:1;
-       u32 act;                //      offset:12  size:4; signed:0;
-       int ifindex;            //      offset:16  size:4; signed:1;
-       int err;                //      offset:20  size:4; signed:1;
-       int to_ifindex;         //      offset:24  size:4; signed:1;
-       u32 map_id;             //      offset:28  size:4; signed:0;
-       int map_index;          //      offset:32  size:4; signed:1;
-};                             //      offset:36
-
-enum {
-       XDP_REDIRECT_SUCCESS = 0,
-       XDP_REDIRECT_ERROR = 1
-};
-
-static __always_inline
-int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
-{
-       u32 key = XDP_REDIRECT_ERROR;
-       int err = ctx->err;
-       u64 *cnt;
-
-       if (!err)
-               key = XDP_REDIRECT_SUCCESS;
-
-       cnt  = bpf_map_lookup_elem(&redirect_err_cnt, &key);
-       if (!cnt)
-               return 1;
-       *cnt += 1;
-
-       return 0; /* Indicate event was filtered (no further processing)*/
-       /*
-        * Returning 1 here would allow e.g. a perf-record tracepoint
-        * to see and record these events, but it doesn't work well
-        * in-practice as stopping perf-record also unload this
-        * bpf_prog.  Plus, there is additional overhead of doing so.
-        */
-}
-
-SEC("tracepoint/xdp/xdp_redirect_err")
-int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
-{
-       return xdp_redirect_collect_stat(ctx);
-}
-
-
-SEC("tracepoint/xdp/xdp_redirect_map_err")
-int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
-{
-       return xdp_redirect_collect_stat(ctx);
-}
-
-/* Likely unloaded when prog starts */
-SEC("tracepoint/xdp/xdp_redirect")
-int trace_xdp_redirect(struct xdp_redirect_ctx *ctx)
-{
-       return xdp_redirect_collect_stat(ctx);
-}
-
-/* Likely unloaded when prog starts */
-SEC("tracepoint/xdp/xdp_redirect_map")
-int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx)
-{
-       return xdp_redirect_collect_stat(ctx);
-}
-
-/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
- * Code in:                kernel/include/trace/events/xdp.h
- */
-struct xdp_exception_ctx {
-       u64 __pad;      // First 8 bytes are not accessible by bpf code
-       int prog_id;    //      offset:8;  size:4; signed:1;
-       u32 act;        //      offset:12; size:4; signed:0;
-       int ifindex;    //      offset:16; size:4; signed:1;
-};
-
-SEC("tracepoint/xdp/xdp_exception")
-int trace_xdp_exception(struct xdp_exception_ctx *ctx)
-{
-       u64 *cnt;
-       u32 key;
-
-       key = ctx->act;
-       if (key > XDP_REDIRECT)
-               key = XDP_UNKNOWN;
-
-       cnt = bpf_map_lookup_elem(&exception_cnt, &key);
-       if (!cnt)
-               return 1;
-       *cnt += 1;
-
-       return 0;
-}
-
-/* Common stats data record shared with _user.c */
-struct datarec {
-       u64 processed;
-       u64 dropped;
-       u64 info;
-       u64 err;
-};
-#define MAX_CPUS 64
-
-struct {
-       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
-       __type(key, u32);
-       __type(value, struct datarec);
-       __uint(max_entries, MAX_CPUS);
-} cpumap_enqueue_cnt SEC(".maps");
-
-struct {
-       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
-       __type(key, u32);
-       __type(value, struct datarec);
-       __uint(max_entries, 1);
-} cpumap_kthread_cnt SEC(".maps");
-
-/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
- * Code in:         kernel/include/trace/events/xdp.h
- */
-struct cpumap_enqueue_ctx {
-       u64 __pad;              // First 8 bytes are not accessible by bpf code
-       int map_id;             //      offset:8;  size:4; signed:1;
-       u32 act;                //      offset:12; size:4; signed:0;
-       int cpu;                //      offset:16; size:4; signed:1;
-       unsigned int drops;     //      offset:20; size:4; signed:0;
-       unsigned int processed; //      offset:24; size:4; signed:0;
-       int to_cpu;             //      offset:28; size:4; signed:1;
-};
-
-SEC("tracepoint/xdp/xdp_cpumap_enqueue")
-int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
-{
-       u32 to_cpu = ctx->to_cpu;
-       struct datarec *rec;
-
-       if (to_cpu >= MAX_CPUS)
-               return 1;
-
-       rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
-       if (!rec)
-               return 0;
-       rec->processed += ctx->processed;
-       rec->dropped   += ctx->drops;
-
-       /* Record bulk events, then userspace can calc average bulk size */
-       if (ctx->processed > 0)
-               rec->info += 1;
-
-       return 0;
-}
-
-/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
- * Code in:         kernel/include/trace/events/xdp.h
- */
-struct cpumap_kthread_ctx {
-       u64 __pad;              // First 8 bytes are not accessible by bpf code
-       int map_id;             //      offset:8;  size:4; signed:1;
-       u32 act;                //      offset:12; size:4; signed:0;
-       int cpu;                //      offset:16; size:4; signed:1;
-       unsigned int drops;     //      offset:20; size:4; signed:0;
-       unsigned int processed; //      offset:24; size:4; signed:0;
-       int sched;              //      offset:28; size:4; signed:1;
-};
-
-SEC("tracepoint/xdp/xdp_cpumap_kthread")
-int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
-{
-       struct datarec *rec;
-       u32 key = 0;
-
-       rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
-       if (!rec)
-               return 0;
-       rec->processed += ctx->processed;
-       rec->dropped   += ctx->drops;
-
-       /* Count times kthread yielded CPU via schedule call */
-       if (ctx->sched)
-               rec->info++;
-
-       return 0;
-}
-
-struct {
-       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
-       __type(key, u32);
-       __type(value, struct datarec);
-       __uint(max_entries, 1);
-} devmap_xmit_cnt SEC(".maps");
-
-/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format
- * Code in:         kernel/include/trace/events/xdp.h
- */
-struct devmap_xmit_ctx {
-       u64 __pad;              // First 8 bytes are not accessible by bpf code
-       int from_ifindex;       //      offset:8;  size:4; signed:1;
-       u32 act;                //      offset:12; size:4; signed:0;
-       int to_ifindex;         //      offset:16; size:4; signed:1;
-       int drops;              //      offset:20; size:4; signed:1;
-       int sent;               //      offset:24; size:4; signed:1;
-       int err;                //      offset:28; size:4; signed:1;
-};
-
-SEC("tracepoint/xdp/xdp_devmap_xmit")
-int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx)
-{
-       struct datarec *rec;
-       u32 key = 0;
-
-       rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &key);
-       if (!rec)
-               return 0;
-       rec->processed += ctx->sent;
-       rec->dropped   += ctx->drops;
-
-       /* Record bulk events, then userspace can calc average bulk size */
-       rec->info += 1;
-
-       /* Record error cases, where no frames were sent */
-       if (ctx->err)
-               rec->err++;
-
-       /* Catch API error where drv ndo_xdp_xmit sent more than asked (drops < 0) */
-       if (ctx->drops < 0)
-               rec->err++;
-
-       return 1;
-}
index 49ebc49..fb9391a 100644
@@ -1,15 +1,12 @@
-/* SPDX-License-Identifier: GPL-2.0
- * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
- */
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. */
 static const char *__doc__=
- "XDP monitor tool, based on tracepoints\n"
-;
+"XDP monitor tool, based on tracepoints\n";
 
 static const char *__doc_err_only__=
- " NOTICE: Only tracking XDP redirect errors\n"
- "         Enable TX success stats via '--stats'\n"
- "         (which comes with a per packet processing overhead)\n"
-;
+" NOTICE: Only tracking XDP redirect errors\n"
+"         Enable redirect success stats via '-s/--stats'\n"
+"         (which comes with a per packet processing overhead)\n";
 
 #include <errno.h>
 #include <stdio.h>
@@ -20,768 +17,103 @@ static const char *__doc_err_only__=
 #include <ctype.h>
 #include <unistd.h>
 #include <locale.h>
-
 #include <sys/resource.h>
 #include <getopt.h>
 #include <net/if.h>
 #include <time.h>
-
 #include <signal.h>
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
 #include "bpf_util.h"
+#include "xdp_sample_user.h"
+#include "xdp_monitor.skel.h"
 
-enum map_type {
-       REDIRECT_ERR_CNT,
-       EXCEPTION_CNT,
-       CPUMAP_ENQUEUE_CNT,
-       CPUMAP_KTHREAD_CNT,
-       DEVMAP_XMIT_CNT,
-};
+static int mask = SAMPLE_REDIRECT_ERR_CNT | SAMPLE_CPUMAP_ENQUEUE_CNT |
+                 SAMPLE_CPUMAP_KTHREAD_CNT | SAMPLE_EXCEPTION_CNT |
+                 SAMPLE_DEVMAP_XMIT_CNT | SAMPLE_DEVMAP_XMIT_CNT_MULTI;
 
-static const char *const map_type_strings[] = {
-       [REDIRECT_ERR_CNT] = "redirect_err_cnt",
-       [EXCEPTION_CNT] = "exception_cnt",
-       [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt",
-       [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt",
-       [DEVMAP_XMIT_CNT] = "devmap_xmit_cnt",
-};
-
-#define NUM_MAP 5
-#define NUM_TP 8
-
-static int tp_cnt;
-static int map_cnt;
-static int verbose = 1;
-static bool debug = false;
-struct bpf_map *map_data[NUM_MAP] = {};
-struct bpf_link *tp_links[NUM_TP] = {};
-struct bpf_object *obj;
+DEFINE_SAMPLE_INIT(xdp_monitor);
 
 static const struct option long_options[] = {
-       {"help",        no_argument,            NULL, 'h' },
-       {"debug",       no_argument,            NULL, 'D' },
-       {"stats",       no_argument,            NULL, 'S' },
-       {"sec",         required_argument,      NULL, 's' },
-       {0, 0, NULL,  0 }
-};
-
-static void int_exit(int sig)
-{
-       /* Detach tracepoints */
-       while (tp_cnt)
-               bpf_link__destroy(tp_links[--tp_cnt]);
-
-       bpf_object__close(obj);
-       exit(0);
-}
-
-/* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */
-#define EXIT_FAIL_MEM  5
-
-static void usage(char *argv[])
-{
-       int i;
-       printf("\nDOCUMENTATION:\n%s\n", __doc__);
-       printf("\n");
-       printf(" Usage: %s (options-see-below)\n",
-              argv[0]);
-       printf(" Listing options:\n");
-       for (i = 0; long_options[i].name != 0; i++) {
-               printf(" --%-15s", long_options[i].name);
-               if (long_options[i].flag != NULL)
-                       printf(" flag (internal value:%d)",
-                              *long_options[i].flag);
-               else
-                       printf("short-option: -%c",
-                              long_options[i].val);
-               printf("\n");
-       }
-       printf("\n");
-}
-
-#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
-static __u64 gettime(void)
-{
-       struct timespec t;
-       int res;
-
-       res = clock_gettime(CLOCK_MONOTONIC, &t);
-       if (res < 0) {
-               fprintf(stderr, "Error with clock_gettime! (%i)\n", res);
-               exit(EXIT_FAILURE);
-       }
-       return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
-}
-
-enum {
-       REDIR_SUCCESS = 0,
-       REDIR_ERROR = 1,
-};
-#define REDIR_RES_MAX 2
-static const char *redir_names[REDIR_RES_MAX] = {
-       [REDIR_SUCCESS] = "Success",
-       [REDIR_ERROR]   = "Error",
-};
-static const char *err2str(int err)
-{
-       if (err < REDIR_RES_MAX)
-               return redir_names[err];
-       return NULL;
-}
-/* enum xdp_action */
-#define XDP_UNKNOWN    XDP_REDIRECT + 1
-#define XDP_ACTION_MAX (XDP_UNKNOWN + 1)
-static const char *xdp_action_names[XDP_ACTION_MAX] = {
-       [XDP_ABORTED]   = "XDP_ABORTED",
-       [XDP_DROP]      = "XDP_DROP",
-       [XDP_PASS]      = "XDP_PASS",
-       [XDP_TX]        = "XDP_TX",
-       [XDP_REDIRECT]  = "XDP_REDIRECT",
-       [XDP_UNKNOWN]   = "XDP_UNKNOWN",
-};
-static const char *action2str(int action)
-{
-       if (action < XDP_ACTION_MAX)
-               return xdp_action_names[action];
-       return NULL;
-}
-
-/* Common stats data record shared with _kern.c */
-struct datarec {
-       __u64 processed;
-       __u64 dropped;
-       __u64 info;
-       __u64 err;
-};
-#define MAX_CPUS 64
-
-/* Userspace structs for collection of stats from maps */
-struct record {
-       __u64 timestamp;
-       struct datarec total;
-       struct datarec *cpu;
+       { "help", no_argument, NULL, 'h' },
+       { "stats", no_argument, NULL, 's' },
+       { "interval", required_argument, NULL, 'i' },
+       { "verbose", no_argument, NULL, 'v' },
+       {}
 };
-struct u64rec {
-       __u64 processed;
-};
-struct record_u64 {
-       /* record for _kern side __u64 values */
-       __u64 timestamp;
-       struct u64rec total;
-       struct u64rec *cpu;
-};
-
-struct stats_record {
-       struct record_u64 xdp_redirect[REDIR_RES_MAX];
-       struct record_u64 xdp_exception[XDP_ACTION_MAX];
-       struct record xdp_cpumap_kthread;
-       struct record xdp_cpumap_enqueue[MAX_CPUS];
-       struct record xdp_devmap_xmit;
-};
-
-static bool map_collect_record(int fd, __u32 key, struct record *rec)
-{
-       /* For percpu maps, userspace gets a value per possible CPU */
-       unsigned int nr_cpus = bpf_num_possible_cpus();
-       struct datarec values[nr_cpus];
-       __u64 sum_processed = 0;
-       __u64 sum_dropped = 0;
-       __u64 sum_info = 0;
-       __u64 sum_err = 0;
-       int i;
-
-       if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
-               fprintf(stderr,
-                       "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
-               return false;
-       }
-       /* Get time as close as possible to reading map contents */
-       rec->timestamp = gettime();
-
-       /* Record and sum values from each CPU */
-       for (i = 0; i < nr_cpus; i++) {
-               rec->cpu[i].processed = values[i].processed;
-               sum_processed        += values[i].processed;
-               rec->cpu[i].dropped = values[i].dropped;
-               sum_dropped        += values[i].dropped;
-               rec->cpu[i].info = values[i].info;
-               sum_info        += values[i].info;
-               rec->cpu[i].err = values[i].err;
-               sum_err        += values[i].err;
-       }
-       rec->total.processed = sum_processed;
-       rec->total.dropped   = sum_dropped;
-       rec->total.info      = sum_info;
-       rec->total.err       = sum_err;
-       return true;
-}
-
-static bool map_collect_record_u64(int fd, __u32 key, struct record_u64 *rec)
-{
-       /* For percpu maps, userspace gets a value per possible CPU */
-       unsigned int nr_cpus = bpf_num_possible_cpus();
-       struct u64rec values[nr_cpus];
-       __u64 sum_total = 0;
-       int i;
-
-       if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
-               fprintf(stderr,
-                       "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
-               return false;
-       }
-       /* Get time as close as possible to reading map contents */
-       rec->timestamp = gettime();
-
-       /* Record and sum values from each CPU */
-       for (i = 0; i < nr_cpus; i++) {
-               rec->cpu[i].processed = values[i].processed;
-               sum_total            += values[i].processed;
-       }
-       rec->total.processed = sum_total;
-       return true;
-}
-
-static double calc_period(struct record *r, struct record *p)
-{
-       double period_ = 0;
-       __u64 period = 0;
-
-       period = r->timestamp - p->timestamp;
-       if (period > 0)
-               period_ = ((double) period / NANOSEC_PER_SEC);
-
-       return period_;
-}
-
-static double calc_period_u64(struct record_u64 *r, struct record_u64 *p)
-{
-       double period_ = 0;
-       __u64 period = 0;
-
-       period = r->timestamp - p->timestamp;
-       if (period > 0)
-               period_ = ((double) period / NANOSEC_PER_SEC);
-
-       return period_;
-}
-
-static double calc_pps(struct datarec *r, struct datarec *p, double period)
-{
-       __u64 packets = 0;
-       double pps = 0;
-
-       if (period > 0) {
-               packets = r->processed - p->processed;
-               pps = packets / period;
-       }
-       return pps;
-}
-
-static double calc_pps_u64(struct u64rec *r, struct u64rec *p, double period)
-{
-       __u64 packets = 0;
-       double pps = 0;
-
-       if (period > 0) {
-               packets = r->processed - p->processed;
-               pps = packets / period;
-       }
-       return pps;
-}
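
Both rate helpers are a plain delta-over-period: with `r->processed` = 5,000,000, `p->processed` = 3,000,000 and a 2.0 s period, the result is (5,000,000 - 3,000,000) / 2.0 = 1,000,000 pps. The `period > 0` guard keeps the first sample, where both records share a timestamp, from dividing by zero.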
-
-static double calc_drop(struct datarec *r, struct datarec *p, double period)
-{
-       __u64 packets = 0;
-       double pps = 0;
-
-       if (period > 0) {
-               packets = r->dropped - p->dropped;
-               pps = packets / period;
-       }
-       return pps;
-}
-
-static double calc_info(struct datarec *r, struct datarec *p, double period)
-{
-       __u64 packets = 0;
-       double pps = 0;
-
-       if (period > 0) {
-               packets = r->info - p->info;
-               pps = packets / period;
-       }
-       return pps;
-}
-
-static double calc_err(struct datarec *r, struct datarec *p, double period)
-{
-       __u64 packets = 0;
-       double pps = 0;
-
-       if (period > 0) {
-               packets = r->err - p->err;
-               pps = packets / period;
-       }
-       return pps;
-}
-
-static void stats_print(struct stats_record *stats_rec,
-                       struct stats_record *stats_prev,
-                       bool err_only)
-{
-       unsigned int nr_cpus = bpf_num_possible_cpus();
-       int rec_i = 0, i, to_cpu;
-       double t = 0, pps = 0;
-
-       /* Header */
-       printf("%-15s %-7s %-12s %-12s %-9s\n",
-              "XDP-event", "CPU:to", "pps", "drop-pps", "extra-info");
-
-       /* tracepoint: xdp:xdp_redirect_* */
-       if (err_only)
-               rec_i = REDIR_ERROR;
-
-       for (; rec_i < REDIR_RES_MAX; rec_i++) {
-               struct record_u64 *rec, *prev;
-               char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n";
-               char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n";
-
-               rec  =  &stats_rec->xdp_redirect[rec_i];
-               prev = &stats_prev->xdp_redirect[rec_i];
-               t = calc_period_u64(rec, prev);
-
-               for (i = 0; i < nr_cpus; i++) {
-                       struct u64rec *r = &rec->cpu[i];
-                       struct u64rec *p = &prev->cpu[i];
-
-                       pps = calc_pps_u64(r, p, t);
-                       if (pps > 0)
-                               printf(fmt1, "XDP_REDIRECT", i,
-                                      rec_i ? 0.0: pps, rec_i ? pps : 0.0,
-                                      err2str(rec_i));
-               }
-               pps = calc_pps_u64(&rec->total, &prev->total, t);
-               printf(fmt2, "XDP_REDIRECT", "total",
-                      rec_i ? 0.0: pps, rec_i ? pps : 0.0, err2str(rec_i));
-       }
-
-       /* tracepoint: xdp:xdp_exception */
-       for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) {
-               struct record_u64 *rec, *prev;
-               char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n";
-               char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n";
-
-               rec  =  &stats_rec->xdp_exception[rec_i];
-               prev = &stats_prev->xdp_exception[rec_i];
-               t = calc_period_u64(rec, prev);
-
-               for (i = 0; i < nr_cpus; i++) {
-                       struct u64rec *r = &rec->cpu[i];
-                       struct u64rec *p = &prev->cpu[i];
-
-                       pps = calc_pps_u64(r, p, t);
-                       if (pps > 0)
-                               printf(fmt1, "Exception", i,
-                                      0.0, pps, action2str(rec_i));
-               }
-               pps = calc_pps_u64(&rec->total, &prev->total, t);
-               if (pps > 0)
-                       printf(fmt2, "Exception", "total",
-                              0.0, pps, action2str(rec_i));
-       }
-
-       /* cpumap enqueue stats */
-       for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) {
-               char *fmt1 = "%-15s %3d:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n";
-               char *fmt2 = "%-15s %3s:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n";
-               struct record *rec, *prev;
-               char *info_str = "";
-               double drop, info;
-
-               rec  =  &stats_rec->xdp_cpumap_enqueue[to_cpu];
-               prev = &stats_prev->xdp_cpumap_enqueue[to_cpu];
-               t = calc_period(rec, prev);
-               for (i = 0; i < nr_cpus; i++) {
-                       struct datarec *r = &rec->cpu[i];
-                       struct datarec *p = &prev->cpu[i];
-
-                       pps  = calc_pps(r, p, t);
-                       drop = calc_drop(r, p, t);
-                       info = calc_info(r, p, t);
-                       if (info > 0) {
-                               info_str = "bulk-average";
-                               info = pps / info; /* calc average bulk size */
-                       }
-                       if (pps > 0)
-                               printf(fmt1, "cpumap-enqueue",
-                                      i, to_cpu, pps, drop, info, info_str);
-               }
-               pps = calc_pps(&rec->total, &prev->total, t);
-               if (pps > 0) {
-                       drop = calc_drop(&rec->total, &prev->total, t);
-                       info = calc_info(&rec->total, &prev->total, t);
-                       if (info > 0) {
-                               info_str = "bulk-average";
-                               info = pps / info; /* calc average bulk size */
-                       }
-                       printf(fmt2, "cpumap-enqueue",
-                              "sum", to_cpu, pps, drop, info, info_str);
-               }
-       }
-
-       /* cpumap kthread stats */
-       {
-               char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.0f %s\n";
-               char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.0f %s\n";
-               struct record *rec, *prev;
-               double drop, info;
-               char *i_str = "";
-
-               rec  =  &stats_rec->xdp_cpumap_kthread;
-               prev = &stats_prev->xdp_cpumap_kthread;
-               t = calc_period(rec, prev);
-               for (i = 0; i < nr_cpus; i++) {
-                       struct datarec *r = &rec->cpu[i];
-                       struct datarec *p = &prev->cpu[i];
-
-                       pps  = calc_pps(r, p, t);
-                       drop = calc_drop(r, p, t);
-                       info = calc_info(r, p, t);
-                       if (info > 0)
-                               i_str = "sched";
-                       if (pps > 0 || drop > 0)
-                               printf(fmt1, "cpumap-kthread",
-                                      i, pps, drop, info, i_str);
-               }
-               pps = calc_pps(&rec->total, &prev->total, t);
-               drop = calc_drop(&rec->total, &prev->total, t);
-               info = calc_info(&rec->total, &prev->total, t);
-               if (info > 0)
-                       i_str = "sched-sum";
-               printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str);
-       }
-
-       /* devmap ndo_xdp_xmit stats */
-       {
-               char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s %s\n";
-               char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s %s\n";
-               struct record *rec, *prev;
-               double drop, info, err;
-               char *i_str = "";
-               char *err_str = "";
-
-               rec  =  &stats_rec->xdp_devmap_xmit;
-               prev = &stats_prev->xdp_devmap_xmit;
-               t = calc_period(rec, prev);
-               for (i = 0; i < nr_cpus; i++) {
-                       struct datarec *r = &rec->cpu[i];
-                       struct datarec *p = &prev->cpu[i];
-
-                       pps  = calc_pps(r, p, t);
-                       drop = calc_drop(r, p, t);
-                       info = calc_info(r, p, t);
-                       err  = calc_err(r, p, t);
-                       if (info > 0) {
-                               i_str = "bulk-average";
-                               info = (pps+drop) / info; /* calc avg bulk */
-                       }
-                       if (err > 0)
-                               err_str = "drv-err";
-                       if (pps > 0 || drop > 0)
-                               printf(fmt1, "devmap-xmit",
-                                      i, pps, drop, info, i_str, err_str);
-               }
-               pps = calc_pps(&rec->total, &prev->total, t);
-               drop = calc_drop(&rec->total, &prev->total, t);
-               info = calc_info(&rec->total, &prev->total, t);
-               err  = calc_err(&rec->total, &prev->total, t);
-               if (info > 0) {
-                       i_str = "bulk-average";
-                       info = (pps+drop) / info; /* calc avg bulk */
-               }
-               if (err > 0)
-                       err_str = "drv-err";
-               printf(fmt2, "devmap-xmit", "total", pps, drop,
-                      info, i_str, err_str);
-       }
-
-       printf("\n");
-}
-
-static bool stats_collect(struct stats_record *rec)
-{
-       int fd;
-       int i;
-
-       /* TODO: Detect if someone unloaded the perf event_fd's, as
-        * this can happen by someone running perf-record -e
-        */
-
-       fd = bpf_map__fd(map_data[REDIRECT_ERR_CNT]);
-       for (i = 0; i < REDIR_RES_MAX; i++)
-               map_collect_record_u64(fd, i, &rec->xdp_redirect[i]);
-
-       fd = bpf_map__fd(map_data[EXCEPTION_CNT]);
-       for (i = 0; i < XDP_ACTION_MAX; i++) {
-               map_collect_record_u64(fd, i, &rec->xdp_exception[i]);
-       }
-
-       fd = bpf_map__fd(map_data[CPUMAP_ENQUEUE_CNT]);
-       for (i = 0; i < MAX_CPUS; i++)
-               map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]);
-
-       fd = bpf_map__fd(map_data[CPUMAP_KTHREAD_CNT]);
-       map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
-
-       fd = bpf_map__fd(map_data[DEVMAP_XMIT_CNT]);
-       map_collect_record(fd, 0, &rec->xdp_devmap_xmit);
-
-       return true;
-}
-
-static void *alloc_rec_per_cpu(int record_size)
-{
-       unsigned int nr_cpus = bpf_num_possible_cpus();
-       void *array;
-
-       array = calloc(nr_cpus, record_size);
-       if (!array) {
-               fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
-               exit(EXIT_FAIL_MEM);
-       }
-       return array;
-}
-
-static struct stats_record *alloc_stats_record(void)
-{
-       struct stats_record *rec;
-       int rec_sz;
-       int i;
-
-       /* Alloc main stats_record structure */
-       rec = calloc(1, sizeof(*rec));
-       if (!rec) {
-               fprintf(stderr, "Mem alloc error\n");
-               exit(EXIT_FAIL_MEM);
-       }
-
-       /* Alloc stats stored per CPU for each record */
-       rec_sz = sizeof(struct u64rec);
-       for (i = 0; i < REDIR_RES_MAX; i++)
-               rec->xdp_redirect[i].cpu = alloc_rec_per_cpu(rec_sz);
-
-       for (i = 0; i < XDP_ACTION_MAX; i++)
-               rec->xdp_exception[i].cpu = alloc_rec_per_cpu(rec_sz);
-
-       rec_sz = sizeof(struct datarec);
-       rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz);
-       rec->xdp_devmap_xmit.cpu    = alloc_rec_per_cpu(rec_sz);
-
-       for (i = 0; i < MAX_CPUS; i++)
-               rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz);
-
-       return rec;
-}
-
-static void free_stats_record(struct stats_record *r)
-{
-       int i;
-
-       for (i = 0; i < REDIR_RES_MAX; i++)
-               free(r->xdp_redirect[i].cpu);
-
-       for (i = 0; i < XDP_ACTION_MAX; i++)
-               free(r->xdp_exception[i].cpu);
-
-       free(r->xdp_cpumap_kthread.cpu);
-       free(r->xdp_devmap_xmit.cpu);
-
-       for (i = 0; i < MAX_CPUS; i++)
-               free(r->xdp_cpumap_enqueue[i].cpu);
-
-       free(r);
-}
-
-/* Pointer swap trick */
-static inline void swap(struct stats_record **a, struct stats_record **b)
-{
-       struct stats_record *tmp;
-
-       tmp = *a;
-       *a = *b;
-       *b = tmp;
-}
-
-static void stats_poll(int interval, bool err_only)
-{
-       struct stats_record *rec, *prev;
-
-       rec  = alloc_stats_record();
-       prev = alloc_stats_record();
-       stats_collect(rec);
-
-       if (err_only)
-               printf("\n%s\n", __doc_err_only__);
-
-       /* Trick to pretty printf with thousands separators use %' */
-       setlocale(LC_NUMERIC, "en_US");
-
-       /* Header */
-       if (verbose)
-               printf("\n%s", __doc__);
-
-       /* TODO Need more advanced stats on error types */
-       if (verbose) {
-               printf(" - Stats map0: %s\n", bpf_map__name(map_data[0]));
-               printf(" - Stats map1: %s\n", bpf_map__name(map_data[1]));
-               printf("\n");
-       }
-       fflush(stdout);
-
-       while (1) {
-               swap(&prev, &rec);
-               stats_collect(rec);
-               stats_print(rec, prev, err_only);
-               fflush(stdout);
-               sleep(interval);
-       }
-
-       free_stats_record(rec);
-       free_stats_record(prev);
-}
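
The %' printf flag used by all the fmt strings in stats_print() only inserts thousands separators once a locale with digit grouping is active, which is what the setlocale(LC_NUMERIC, "en_US") call above arranges. A self-contained sketch (assumes the en_US locale is actually generated on the host):

```c
/* Sketch: glibc's apostrophe flag groups digits per the numeric locale. */
#include <locale.h>
#include <stdio.h>

int main(void)
{
	printf("%'.0f\n", 1234567.0);	/* "1234567" in the default C locale */
	setlocale(LC_NUMERIC, "en_US");
	printf("%'.0f\n", 1234567.0);	/* "1,234,567" with grouping active */
	return 0;
}
```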
-
-static void print_bpf_prog_info(void)
-{
-       struct bpf_program *prog;
-       struct bpf_map *map;
-       int i = 0;
-
-       /* Prog info */
-       printf("Loaded BPF object has %d bpf program(s)\n", tp_cnt);
-       bpf_object__for_each_program(prog, obj) {
-               printf(" - prog_fd[%d] = fd(%d)\n", i, bpf_program__fd(prog));
-               i++;
-       }
-
-       i = 0;
-       /* Maps info */
-       printf("Loaded BPF object has %d map(s)\n", map_cnt);
-       bpf_object__for_each_map(map, obj) {
-               const char *name = bpf_map__name(map);
-               int fd           = bpf_map__fd(map);
-
-               printf(" - map_data[%d] = fd(%d) name:%s\n", i, fd, name);
-               i++;
-       }
-
-       /* Event info */
-       printf("Searching for (max:%d) event file descriptor(s)\n", tp_cnt);
-       for (i = 0; i < tp_cnt; i++) {
-               int fd = bpf_link__fd(tp_links[i]);
-
-               if (fd != -1)
-                       printf(" - event_fd[%d] = fd(%d)\n", i, fd);
-       }
-}
 
 int main(int argc, char **argv)
 {
-       struct bpf_program *prog;
-       int longindex = 0, opt;
-       int ret = EXIT_FAILURE;
-       enum map_type type;
-       char filename[256];
-
-       /* Default settings: */
+       unsigned long interval = 2;
+       int ret = EXIT_FAIL_OPTION;
+       struct xdp_monitor *skel;
        bool errors_only = true;
-       int interval = 2;
+       int longindex = 0, opt;
+       bool error = true;
 
        /* Parse commands line args */
-       while ((opt = getopt_long(argc, argv, "hDSs:",
+       while ((opt = getopt_long(argc, argv, "si:vh",
                                  long_options, &longindex)) != -1) {
                switch (opt) {
-               case 'D':
-                       debug = true;
-                       break;
-               case 'S':
+               case 's':
                        errors_only = false;
+                       mask |= SAMPLE_REDIRECT_CNT;
                        break;
-               case 's':
-                       interval = atoi(optarg);
+               case 'i':
+                       interval = strtoul(optarg, NULL, 0);
+                       break;
+               case 'v':
+                       sample_switch_mode();
                        break;
                case 'h':
+                       error = false;
                default:
-                       usage(argv);
+                       sample_usage(argv, long_options, __doc__, mask, error);
                        return ret;
                }
        }
 
-       snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-
-       /* Remove tracepoint program when program is interrupted or killed */
-       signal(SIGINT, int_exit);
-       signal(SIGTERM, int_exit);
-
-       obj = bpf_object__open_file(filename, NULL);
-       if (libbpf_get_error(obj)) {
-               printf("ERROR: opening BPF object file failed\n");
-               obj = NULL;
-               goto cleanup;
-       }
-
-       /* load BPF program */
-       if (bpf_object__load(obj)) {
-               printf("ERROR: loading BPF object file failed\n");
-               goto cleanup;
+       skel = xdp_monitor__open();
+       if (!skel) {
+               fprintf(stderr, "Failed to xdp_monitor__open: %s\n",
+                       strerror(errno));
+               ret = EXIT_FAIL_BPF;
+               goto end;
        }
 
-       for (type = 0; type < NUM_MAP; type++) {
-               map_data[type] =
-                       bpf_object__find_map_by_name(obj, map_type_strings[type]);
-
-               if (libbpf_get_error(map_data[type])) {
-                       printf("ERROR: finding a map in obj file failed\n");
-                       goto cleanup;
-               }
-               map_cnt++;
+       ret = sample_init_pre_load(skel);
+       if (ret < 0) {
+               fprintf(stderr, "Failed to sample_init_pre_load: %s\n", strerror(-ret));
+               ret = EXIT_FAIL_BPF;
+               goto end_destroy;
        }
 
-       bpf_object__for_each_program(prog, obj) {
-               tp_links[tp_cnt] = bpf_program__attach(prog);
-               if (libbpf_get_error(tp_links[tp_cnt])) {
-                       printf("ERROR: bpf_program__attach failed\n");
-                       tp_links[tp_cnt] = NULL;
-                       goto cleanup;
-               }
-               tp_cnt++;
+       ret = xdp_monitor__load(skel);
+       if (ret < 0) {
+               fprintf(stderr, "Failed to xdp_monitor__load: %s\n", strerror(errno));
+               ret = EXIT_FAIL_BPF;
+               goto end_destroy;
        }
 
-       if (debug) {
-               print_bpf_prog_info();
+       ret = sample_init(skel, mask);
+       if (ret < 0) {
+               fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret));
+               ret = EXIT_FAIL_BPF;
+               goto end_destroy;
        }
 
-       /* Unload/stop tracepoint event by closing bpf_link's */
-       if (errors_only) {
-               /* The bpf_link[i] index depends on the order in which
-                * the functions were defined in _kern.c
-                */
-               bpf_link__destroy(tp_links[2]); /* tracepoint/xdp/xdp_redirect */
-               tp_links[2] = NULL;
+       if (errors_only)
+               printf("%s", __doc_err_only__);
 
-               bpf_link__destroy(tp_links[3]); /* tracepoint/xdp/xdp_redirect_map */
-               tp_links[3] = NULL;
+       ret = sample_run(interval, NULL, NULL);
+       if (ret < 0) {
+               fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret));
+               ret = EXIT_FAIL;
+               goto end_destroy;
        }
-
-       stats_poll(interval, errors_only);
-
-       ret = EXIT_SUCCESS;
-
-cleanup:
-       /* Detach tracepoints */
-       while (tp_cnt)
-               bpf_link__destroy(tp_links[--tp_cnt]);
-
-       bpf_object__close(obj);
-       return ret;
+       ret = EXIT_OK;
+end_destroy:
+       xdp_monitor__destroy(skel);
+end:
+       sample_exit(ret);
 }
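
Stripped of the sample_*() wrappers, the rewritten main() is the standard libbpf skeleton lifecycle: open parses the embedded object, load creates maps and runs the verifier, destroy tears everything down. A bare-bones sketch of that pattern (the generated xdp_monitor__attach() is shown for completeness; this sample attaches through sample_init() instead):

```c
/* Sketch: canonical lifecycle of a bpftool-generated skeleton. */
#include "xdp_monitor.skel.h"

static int run(void)
{
	struct xdp_monitor *skel;
	int err;

	skel = xdp_monitor__open();	/* parse object; nothing in the kernel yet */
	if (!skel)
		return -1;

	/* rodata/map tweaks would go here, before load */

	err = xdp_monitor__load(skel);	/* create maps, verify and load progs */
	if (!err)
		err = xdp_monitor__attach(skel);	/* hook up programs */

	xdp_monitor__destroy(skel);	/* detaches links and frees the object */
	return err;
}
```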
diff --git a/samples/bpf/xdp_redirect.bpf.c b/samples/bpf/xdp_redirect.bpf.c
new file mode 100644
index 0000000..7c02bac
--- /dev/null
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2016 John Fastabend <john.r.fastabend@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include "vmlinux.h"
+#include "xdp_sample.bpf.h"
+#include "xdp_sample_shared.h"
+
+const volatile int ifindex_out;
+
+SEC("xdp")
+int xdp_redirect_prog(struct xdp_md *ctx)
+{
+       void *data_end = (void *)(long)ctx->data_end;
+       void *data = (void *)(long)ctx->data;
+       u32 key = bpf_get_smp_processor_id();
+       struct ethhdr *eth = data;
+       struct datarec *rec;
+       u64 nh_off;
+
+       nh_off = sizeof(*eth);
+       if (data + nh_off > data_end)
+               return XDP_DROP;
+
+       rec = bpf_map_lookup_elem(&rx_cnt, &key);
+       if (!rec)
+               return XDP_PASS;
+       NO_TEAR_INC(rec->processed);
+
+       swap_src_dst_mac(data);
+       return bpf_redirect(ifindex_out, 0);
+}
+
+/* Redirect require an XDP bpf_prog loaded on the TX device */
+SEC("xdp")
+int xdp_redirect_dummy_prog(struct xdp_md *ctx)
+{
+       return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
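
Since ifindex_out is const volatile, it is placed in the object's .rodata, which the skeleton exposes as writable between open and load; after load the verifier treats it as a known constant. A sketch of the loader side (skeleton type and field names assumed from bpftool's conventions, not shown in this patch):

```c
/* Sketch: bake the egress ifindex into .rodata before load. */
static int configure_and_load(struct xdp_redirect *skel)
{
	/* example value; normally taken from if_nametoindex(ifname) */
	skel->rodata->ifindex_out = 4;
	return xdp_redirect__load(skel);	/* .rodata is frozen from here on */
}
```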
similarity index 52%
rename from samples/bpf/xdp_redirect_cpu_kern.c
rename to samples/bpf/xdp_redirect_cpu.bpf.c
index 8255025..f10fe3c 100644
@@ -2,74 +2,18 @@
  *
  *  GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
  */
-#include <uapi/linux/if_ether.h>
-#include <uapi/linux/if_packet.h>
-#include <uapi/linux/if_vlan.h>
-#include <uapi/linux/ip.h>
-#include <uapi/linux/ipv6.h>
-#include <uapi/linux/in.h>
-#include <uapi/linux/tcp.h>
-#include <uapi/linux/udp.h>
-
-#include <uapi/linux/bpf.h>
-#include <bpf/bpf_helpers.h>
+#include "vmlinux.h"
+#include "xdp_sample.bpf.h"
+#include "xdp_sample_shared.h"
 #include "hash_func01.h"
 
-#define MAX_CPUS NR_CPUS
-
 /* Special map type that can XDP_REDIRECT frames to another CPU */
 struct {
        __uint(type, BPF_MAP_TYPE_CPUMAP);
        __uint(key_size, sizeof(u32));
        __uint(value_size, sizeof(struct bpf_cpumap_val));
-       __uint(max_entries, MAX_CPUS);
 } cpu_map SEC(".maps");
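
max_entries is intentionally dropped from the definition; the loader is expected to size the map to the running machine between open and load. Assuming libbpf's bpf_map__set_max_entries() (the successor of bpf_map__resize()), that step looks roughly like:

```c
/* Sketch: size cpu_map to the number of possible CPUs at runtime. */
static int size_cpu_map(struct xdp_redirect_cpu *skel)
{
	int n_cpus = libbpf_num_possible_cpus();

	if (n_cpus < 0)
		return n_cpus;
	return bpf_map__set_max_entries(skel->maps.cpu_map, n_cpus);
}
```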
 
-/* Common stats data record to keep userspace more simple */
-struct datarec {
-       __u64 processed;
-       __u64 dropped;
-       __u64 issue;
-       __u64 xdp_pass;
-       __u64 xdp_drop;
-       __u64 xdp_redirect;
-};
-
-/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
- * feedback.  Redirect TX errors can be caught via a tracepoint.
- */
-struct {
-       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
-       __type(key, u32);
-       __type(value, struct datarec);
-       __uint(max_entries, 1);
-} rx_cnt SEC(".maps");
-
-/* Used by trace point */
-struct {
-       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
-       __type(key, u32);
-       __type(value, struct datarec);
-       __uint(max_entries, 2);
-       /* TODO: have entries for all possible errno's */
-} redirect_err_cnt SEC(".maps");
-
-/* Used by trace point */
-struct {
-       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
-       __type(key, u32);
-       __type(value, struct datarec);
-       __uint(max_entries, MAX_CPUS);
-} cpumap_enqueue_cnt SEC(".maps");
-
-/* Used by trace point */
-struct {
-       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
-       __type(key, u32);
-       __type(value, struct datarec);
-       __uint(max_entries, 1);
-} cpumap_kthread_cnt SEC(".maps");
-
 /* Set of maps controlling available CPU, and for iterating through
  * selectable redirect CPUs.
  */
@@ -77,14 +21,15 @@ struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __type(key, u32);
        __type(value, u32);
-       __uint(max_entries, MAX_CPUS);
 } cpus_available SEC(".maps");
+
 struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __type(key, u32);
        __type(value, u32);
        __uint(max_entries, 1);
 } cpus_count SEC(".maps");
+
 struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __type(key, u32);
@@ -92,24 +37,16 @@ struct {
        __uint(max_entries, 1);
 } cpus_iterator SEC(".maps");
 
-/* Used by trace point */
 struct {
-       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
-       __type(key, u32);
-       __type(value, struct datarec);
+       __uint(type, BPF_MAP_TYPE_DEVMAP);
+       __uint(key_size, sizeof(int));
+       __uint(value_size, sizeof(struct bpf_devmap_val));
        __uint(max_entries, 1);
-} exception_cnt SEC(".maps");
+} tx_port SEC(".maps");
 
-/* Helper parse functions */
+char tx_mac_addr[ETH_ALEN];
 
-/* Parse Ethernet layer 2, extract network layer 3 offset and protocol
- *
- * Returns false on error and non-supported ether-type
- */
-struct vlan_hdr {
-       __be16 h_vlan_TCI;
-       __be16 h_vlan_encapsulated_proto;
-};
+/* Helper parse functions */
 
 static __always_inline
 bool parse_eth(struct ethhdr *eth, void *data_end,
@@ -125,11 +62,12 @@ bool parse_eth(struct ethhdr *eth, void *data_end,
        eth_type = eth->h_proto;
 
        /* Skip non 802.3 Ethertypes */
-       if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
+       if (__builtin_expect(bpf_ntohs(eth_type) < ETH_P_802_3_MIN, 0))
                return false;
 
        /* Handle VLAN tagged packet */
-       if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
+       if (eth_type == bpf_htons(ETH_P_8021Q) ||
+           eth_type == bpf_htons(ETH_P_8021AD)) {
                struct vlan_hdr *vlan_hdr;
 
                vlan_hdr = (void *)eth + offset;
@@ -139,7 +77,8 @@ bool parse_eth(struct ethhdr *eth, void *data_end,
                eth_type = vlan_hdr->h_vlan_encapsulated_proto;
        }
        /* Handle double VLAN tagged packet */
-       if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
+       if (eth_type == bpf_htons(ETH_P_8021Q) ||
+           eth_type == bpf_htons(ETH_P_8021AD)) {
                struct vlan_hdr *vlan_hdr;
 
                vlan_hdr = (void *)eth + offset;
@@ -149,7 +88,7 @@ bool parse_eth(struct ethhdr *eth, void *data_end,
                eth_type = vlan_hdr->h_vlan_encapsulated_proto;
        }
 
-       *eth_proto = ntohs(eth_type);
+       *eth_proto = bpf_ntohs(eth_type);
        *l3_offset = offset;
        return true;
 }
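
For a double-tagged frame the two VLAN blocks simply walk the layer-3 offset forward in 4-byte steps: offset starts at sizeof(struct ethhdr) = 14, becomes 18 after the outer tag and 22 after the inner one, re-checking against data_end before each dereference; *eth_proto ends up holding the innermost Ethertype in host byte order.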
@@ -172,7 +111,7 @@ u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
        if (udph + 1 > data_end)
                return 0;
 
-       dport = ntohs(udph->dest);
+       dport = bpf_ntohs(udph->dest);
        return dport;
 }
 
@@ -200,50 +139,48 @@ int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
        return ip6h->nexthdr;
 }
 
-SEC("xdp_cpu_map0")
+SEC("xdp")
 int  xdp_prognum0_no_touch(struct xdp_md *ctx)
 {
-       void *data_end = (void *)(long)ctx->data_end;
-       void *data     = (void *)(long)ctx->data;
+       u32 key = bpf_get_smp_processor_id();
        struct datarec *rec;
        u32 *cpu_selected;
-       u32 cpu_dest;
-       u32 key = 0;
+       u32 cpu_dest = 0;
+       u32 key0 = 0;
 
        /* Only use first entry in cpus_available */
-       cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
+       cpu_selected = bpf_map_lookup_elem(&cpus_available, &key0);
        if (!cpu_selected)
                return XDP_ABORTED;
        cpu_dest = *cpu_selected;
 
-       /* Count RX packet in map */
        rec = bpf_map_lookup_elem(&rx_cnt, &key);
        if (!rec)
-               return XDP_ABORTED;
-       rec->processed++;
+               return XDP_PASS;
+       NO_TEAR_INC(rec->processed);
 
-       if (cpu_dest >= MAX_CPUS) {
-               rec->issue++;
+       if (cpu_dest >= nr_cpus) {
+               NO_TEAR_INC(rec->issue);
                return XDP_ABORTED;
        }
-
        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
 }
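
NO_TEAR_INC comes from the xdp_sample headers included at the top. Because rx_cnt is a per-CPU map keyed by bpf_get_smp_processor_id(), no cross-CPU atomicity is needed; the macro only has to keep the compiler from tearing or eliding the read-modify-write. A plausible shape for it, offered purely as an assumption about the header:

```c
/* Hypothetical sketch of NO_TEAR_INC; the real macro ships in the
 * xdp_sample headers. One untorn volatile load plus one untorn
 * volatile store is enough for a counter only its own CPU writes. */
#define NO_TEAR_ADD(x, val)	WRITE_ONCE((x), READ_ONCE(x) + (val))
#define NO_TEAR_INC(x)		NO_TEAR_ADD((x), 1)
```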
 
-SEC("xdp_cpu_map1_touch_data")
+SEC("xdp")
 int  xdp_prognum1_touch_data(struct xdp_md *ctx)
 {
        void *data_end = (void *)(long)ctx->data_end;
        void *data     = (void *)(long)ctx->data;
+       u32 key = bpf_get_smp_processor_id();
        struct ethhdr *eth = data;
        struct datarec *rec;
        u32 *cpu_selected;
-       u32 cpu_dest;
+       u32 cpu_dest = 0;
+       u32 key0 = 0;
        u16 eth_type;
-       u32 key = 0;
 
        /* Only use first entry in cpus_available */
-       cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
+       cpu_selected = bpf_map_lookup_elem(&cpus_available, &key0);
        if (!cpu_selected)
                return XDP_ABORTED;
        cpu_dest = *cpu_selected;
@@ -252,36 +189,33 @@ int  xdp_prognum1_touch_data(struct xdp_md *ctx)
        if (eth + 1 > data_end)
                return XDP_ABORTED;
 
-       /* Count RX packet in map */
        rec = bpf_map_lookup_elem(&rx_cnt, &key);
        if (!rec)
-               return XDP_ABORTED;
-       rec->processed++;
+               return XDP_PASS;
+       NO_TEAR_INC(rec->processed);
 
        /* Read packet data, and use it (drop non 802.3 Ethertypes) */
        eth_type = eth->h_proto;
-       if (ntohs(eth_type) < ETH_P_802_3_MIN) {
-               rec->dropped++;
+       if (bpf_ntohs(eth_type) < ETH_P_802_3_MIN) {
+               NO_TEAR_INC(rec->dropped);
                return XDP_DROP;
        }
 
-       if (cpu_dest >= MAX_CPUS) {
-               rec->issue++;
+       if (cpu_dest >= nr_cpus) {
+               NO_TEAR_INC(rec->issue);
                return XDP_ABORTED;
        }
-
        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
 }
 
-SEC("xdp_cpu_map2_round_robin")
+SEC("xdp")
 int  xdp_prognum2_round_robin(struct xdp_md *ctx)
 {
        void *data_end = (void *)(long)ctx->data_end;
        void *data     = (void *)(long)ctx->data;
-       struct ethhdr *eth = data;
+       u32 key = bpf_get_smp_processor_id();
        struct datarec *rec;
-       u32 cpu_dest;
-       u32 *cpu_lookup;
+       u32 cpu_dest = 0;
        u32 key0 = 0;
 
        u32 *cpu_selected;
@@ -307,40 +241,37 @@ int  xdp_prognum2_round_robin(struct xdp_md *ctx)
                return XDP_ABORTED;
        cpu_dest = *cpu_selected;
 
-       /* Count RX packet in map */
-       rec = bpf_map_lookup_elem(&rx_cnt, &key0);
+       rec = bpf_map_lookup_elem(&rx_cnt, &key);
        if (!rec)
-               return XDP_ABORTED;
-       rec->processed++;
+               return XDP_PASS;
+       NO_TEAR_INC(rec->processed);
 
-       if (cpu_dest >= MAX_CPUS) {
-               rec->issue++;
+       if (cpu_dest >= nr_cpus) {
+               NO_TEAR_INC(rec->issue);
                return XDP_ABORTED;
        }
-
        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
 }
 
-SEC("xdp_cpu_map3_proto_separate")
+SEC("xdp")
 int  xdp_prognum3_proto_separate(struct xdp_md *ctx)
 {
        void *data_end = (void *)(long)ctx->data_end;
        void *data     = (void *)(long)ctx->data;
+       u32 key = bpf_get_smp_processor_id();
        struct ethhdr *eth = data;
        u8 ip_proto = IPPROTO_UDP;
        struct datarec *rec;
        u16 eth_proto = 0;
        u64 l3_offset = 0;
        u32 cpu_dest = 0;
-       u32 cpu_idx = 0;
        u32 *cpu_lookup;
-       u32 key = 0;
+       u32 cpu_idx = 0;
 
-       /* Count RX packet in map */
        rec = bpf_map_lookup_elem(&rx_cnt, &key);
        if (!rec)
-               return XDP_ABORTED;
-       rec->processed++;
+               return XDP_PASS;
+       NO_TEAR_INC(rec->processed);
 
        if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
                return XDP_PASS; /* Just skip */
@@ -381,35 +312,33 @@ int  xdp_prognum3_proto_separate(struct xdp_md *ctx)
                return XDP_ABORTED;
        cpu_dest = *cpu_lookup;
 
-       if (cpu_dest >= MAX_CPUS) {
-               rec->issue++;
+       if (cpu_dest >= nr_cpus) {
+               NO_TEAR_INC(rec->issue);
                return XDP_ABORTED;
        }
-
        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
 }
 
-SEC("xdp_cpu_map4_ddos_filter_pktgen")
+SEC("xdp")
 int  xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
 {
        void *data_end = (void *)(long)ctx->data_end;
        void *data     = (void *)(long)ctx->data;
+       u32 key = bpf_get_smp_processor_id();
        struct ethhdr *eth = data;
        u8 ip_proto = IPPROTO_UDP;
        struct datarec *rec;
        u16 eth_proto = 0;
        u64 l3_offset = 0;
        u32 cpu_dest = 0;
+       u32 *cpu_lookup;
        u32 cpu_idx = 0;
        u16 dest_port;
-       u32 *cpu_lookup;
-       u32 key = 0;
 
-       /* Count RX packet in map */
        rec = bpf_map_lookup_elem(&rx_cnt, &key);
        if (!rec)
-               return XDP_ABORTED;
-       rec->processed++;
+               return XDP_PASS;
+       NO_TEAR_INC(rec->processed);
 
        if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
                return XDP_PASS; /* Just skip */
@@ -443,8 +372,7 @@ int  xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
                /* DDoS filter UDP port 9 (pktgen) */
                dest_port = get_dest_port_ipv4_udp(ctx, l3_offset);
                if (dest_port == 9) {
-                       if (rec)
-                               rec->dropped++;
+                       NO_TEAR_INC(rec->dropped);
                        return XDP_DROP;
                }
                break;
@@ -457,11 +385,10 @@ int  xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
                return XDP_ABORTED;
        cpu_dest = *cpu_lookup;
 
-       if (cpu_dest >= MAX_CPUS) {
-               rec->issue++;
+       if (cpu_dest >= nr_cpus) {
+               NO_TEAR_INC(rec->issue);
                return XDP_ABORTED;
        }
-
        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
 }
 
@@ -496,10 +423,10 @@ u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
        if (ip6h + 1 > data_end)
                return 0;
 
-       cpu_hash  = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0];
-       cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1];
-       cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2];
-       cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3];
+       cpu_hash  = ip6h->saddr.in6_u.u6_addr32[0] + ip6h->daddr.in6_u.u6_addr32[0];
+       cpu_hash += ip6h->saddr.in6_u.u6_addr32[1] + ip6h->daddr.in6_u.u6_addr32[1];
+       cpu_hash += ip6h->saddr.in6_u.u6_addr32[2] + ip6h->daddr.in6_u.u6_addr32[2];
+       cpu_hash += ip6h->saddr.in6_u.u6_addr32[3] + ip6h->daddr.in6_u.u6_addr32[3];
        cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr);
 
        return cpu_hash;
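
Adding the source and destination words before hashing is what makes the load balancing symmetric: addition commutes, so a reply packet with src/dst swapped produces bit-for-bit the same 4-byte input to SuperFastHash (and the same nexthdr seed) and therefore lands on the same CPU as the forward direction.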
@@ -509,30 +436,29 @@ u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
  * hashing scheme is symmetric, meaning swapping IP src/dest still hit
  * same CPU.
  */
-SEC("xdp_cpu_map5_lb_hash_ip_pairs")
+SEC("xdp")
 int  xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx)
 {
        void *data_end = (void *)(long)ctx->data_end;
        void *data     = (void *)(long)ctx->data;
+       u32 key = bpf_get_smp_processor_id();
        struct ethhdr *eth = data;
-       u8 ip_proto = IPPROTO_UDP;
        struct datarec *rec;
        u16 eth_proto = 0;
        u64 l3_offset = 0;
        u32 cpu_dest = 0;
        u32 cpu_idx = 0;
        u32 *cpu_lookup;
+       u32 key0 = 0;
        u32 *cpu_max;
        u32 cpu_hash;
-       u32 key = 0;
 
-       /* Count RX packet in map */
        rec = bpf_map_lookup_elem(&rx_cnt, &key);
        if (!rec)
-               return XDP_ABORTED;
-       rec->processed++;
+               return XDP_PASS;
+       NO_TEAR_INC(rec->processed);
 
-       cpu_max = bpf_map_lookup_elem(&cpus_count, &key);
+       cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
        if (!cpu_max)
                return XDP_ABORTED;
 
@@ -560,171 +486,56 @@ int  xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx)
                return XDP_ABORTED;
        cpu_dest = *cpu_lookup;
 
-       if (cpu_dest >= MAX_CPUS) {
-               rec->issue++;
+       if (cpu_dest >= nr_cpus) {
+               NO_TEAR_INC(rec->issue);
                return XDP_ABORTED;
        }
-
        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
 }
 
-char _license[] SEC("license") = "GPL";
-
-/*** Trace point code ***/
-
-/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
- * Code in:                kernel/include/trace/events/xdp.h
- */
-struct xdp_redirect_ctx {
-       u64 __pad;      // First 8 bytes are not accessible by bpf code
-       int prog_id;    //      offset:8;  size:4; signed:1;
-       u32 act;        //      offset:12  size:4; signed:0;
-       int ifindex;    //      offset:16  size:4; signed:1;
-       int err;        //      offset:20  size:4; signed:1;
-       int to_ifindex; //      offset:24  size:4; signed:1;
-       u32 map_id;     //      offset:28  size:4; signed:0;
-       int map_index;  //      offset:32  size:4; signed:1;
-};                     //      offset:36
-
-enum {
-       XDP_REDIRECT_SUCCESS = 0,
-       XDP_REDIRECT_ERROR = 1
-};
-
-static __always_inline
-int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
+SEC("xdp_cpumap/redirect")
+int xdp_redirect_cpu_devmap(struct xdp_md *ctx)
 {
-       u32 key = XDP_REDIRECT_ERROR;
-       struct datarec *rec;
-       int err = ctx->err;
+       void *data_end = (void *)(long)ctx->data_end;
+       void *data = (void *)(long)ctx->data;
+       struct ethhdr *eth = data;
+       u64 nh_off;
 
-       if (!err)
-               key = XDP_REDIRECT_SUCCESS;
+       nh_off = sizeof(*eth);
+       if (data + nh_off > data_end)
+               return XDP_DROP;
 
-       rec = bpf_map_lookup_elem(&redirect_err_cnt, &key);
-       if (!rec)
-               return 0;
-       rec->dropped += 1;
-
-       return 0; /* Indicate event was filtered (no further processing)*/
-       /*
-        * Returning 1 here would allow e.g. a perf-record tracepoint
-        * to see and record these events, but it doesn't work well
-        * in-practice as stopping perf-record also unload this
-        * bpf_prog.  Plus, there is additional overhead of doing so.
-        */
+       swap_src_dst_mac(data);
+       return bpf_redirect_map(&tx_port, 0, 0);
 }
 
-SEC("tracepoint/xdp/xdp_redirect_err")
-int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
+SEC("xdp_cpumap/pass")
+int xdp_redirect_cpu_pass(struct xdp_md *ctx)
 {
-       return xdp_redirect_collect_stat(ctx);
+       return XDP_PASS;
 }
 
-SEC("tracepoint/xdp/xdp_redirect_map_err")
-int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
+SEC("xdp_cpumap/drop")
+int xdp_redirect_cpu_drop(struct xdp_md *ctx)
 {
-       return xdp_redirect_collect_stat(ctx);
+       return XDP_DROP;
 }
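
These SEC("xdp_cpumap/...") programs are not attached to a netdev; they are installed into individual cpu_map entries and run on the destination CPU once the cpumap kthread dequeues the frame. Userspace picks one per entry via struct bpf_cpumap_val; a sketch, using the skeleton handles for the programs above:

```c
/* Sketch: install a queue size plus a second-level XDP program into
 * one cpu_map slot. bpf_cpumap_val is the kernel UAPI value type. */
static int set_cpumap_entry(struct xdp_redirect_cpu *skel, __u32 cpu)
{
	struct bpf_cpumap_val val = {
		.qsize = 2048,	/* example ptr_ring size for the kthread */
		.bpf_prog.fd =
			bpf_program__fd(skel->progs.xdp_redirect_cpu_pass),
	};

	return bpf_map_update_elem(bpf_map__fd(skel->maps.cpu_map),
				   &cpu, &val, 0);
}
```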
 
-/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
- * Code in:                kernel/include/trace/events/xdp.h
- */
-struct xdp_exception_ctx {
-       u64 __pad;      // First 8 bytes are not accessible by bpf code
-       int prog_id;    //      offset:8;  size:4; signed:1;
-       u32 act;        //      offset:12; size:4; signed:0;
-       int ifindex;    //      offset:16; size:4; signed:1;
-};
-
-SEC("tracepoint/xdp/xdp_exception")
-int trace_xdp_exception(struct xdp_exception_ctx *ctx)
+SEC("xdp_devmap/egress")
+int xdp_redirect_egress_prog(struct xdp_md *ctx)
 {
-       struct datarec *rec;
-       u32 key = 0;
-
-       rec = bpf_map_lookup_elem(&exception_cnt, &key);
-       if (!rec)
-               return 1;
-       rec->dropped += 1;
-
-       return 0;
-}
+       void *data_end = (void *)(long)ctx->data_end;
+       void *data = (void *)(long)ctx->data;
+       struct ethhdr *eth = data;
+       u64 nh_off;
 
-/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
- * Code in:         kernel/include/trace/events/xdp.h
- */
-struct cpumap_enqueue_ctx {
-       u64 __pad;              // First 8 bytes are not accessible by bpf code
-       int map_id;             //      offset:8;  size:4; signed:1;
-       u32 act;                //      offset:12; size:4; signed:0;
-       int cpu;                //      offset:16; size:4; signed:1;
-       unsigned int drops;     //      offset:20; size:4; signed:0;
-       unsigned int processed; //      offset:24; size:4; signed:0;
-       int to_cpu;             //      offset:28; size:4; signed:1;
-};
-
-SEC("tracepoint/xdp/xdp_cpumap_enqueue")
-int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
-{
-       u32 to_cpu = ctx->to_cpu;
-       struct datarec *rec;
+       nh_off = sizeof(*eth);
+       if (data + nh_off > data_end)
+               return XDP_DROP;
 
-       if (to_cpu >= MAX_CPUS)
-               return 1;
+       __builtin_memcpy(eth->h_source, (const char *)tx_mac_addr, ETH_ALEN);
 
-       rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
-       if (!rec)
-               return 0;
-       rec->processed += ctx->processed;
-       rec->dropped   += ctx->drops;
-
-       /* Record bulk events, then userspace can calc average bulk size */
-       if (ctx->processed > 0)
-               rec->issue += 1;
-
-       /* Inception: It's possible to detect overload situations, via
-        * this tracepoint.  This can be used for creating a feedback
-        * loop to XDP, which can take appropriate actions to mitigate
-        * this overload situation.
-        */
-       return 0;
+       return XDP_PASS;
 }
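
tx_mac_addr is a plain writable global, so it surfaces in the skeleton's data view and can be filled by the loader before traffic flows; presumably the userspace side copies the egress device's MAC address there. A sketch, with the section name (bss) assumed:

```c
/* Sketch: hand the egress device's MAC to the devmap egress program.
 * Assumes tx_mac_addr lands in .bss and <string.h> is included. */
static void set_tx_mac(struct xdp_redirect_cpu *skel,
		       const unsigned char mac[ETH_ALEN])
{
	memcpy(skel->bss->tx_mac_addr, mac, ETH_ALEN);
}
```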
 
-/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
- * Code in:         kernel/include/trace/events/xdp.h
- */
-struct cpumap_kthread_ctx {
-       u64 __pad;                      // First 8 bytes are not accessible
-       int map_id;                     //      offset:8;  size:4; signed:1;
-       u32 act;                        //      offset:12; size:4; signed:0;
-       int cpu;                        //      offset:16; size:4; signed:1;
-       unsigned int drops;             //      offset:20; size:4; signed:0;
-       unsigned int processed;         //      offset:24; size:4; signed:0;
-       int sched;                      //      offset:28; size:4; signed:1;
-       unsigned int xdp_pass;          //      offset:32; size:4; signed:0;
-       unsigned int xdp_drop;          //      offset:36; size:4; signed:0;
-       unsigned int xdp_redirect;      //      offset:40; size:4; signed:0;
-};
-
-SEC("tracepoint/xdp/xdp_cpumap_kthread")
-int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
-{
-       struct datarec *rec;
-       u32 key = 0;
-
-       rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
-       if (!rec)
-               return 0;
-       rec->processed += ctx->processed;
-       rec->dropped   += ctx->drops;
-       rec->xdp_pass  += ctx->xdp_pass;
-       rec->xdp_drop  += ctx->xdp_drop;
-       rec->xdp_redirect  += ctx->xdp_redirect;
-
-       /* Count times kthread yielded CPU via schedule call */
-       if (ctx->sched)
-               rec->issue++;
-
-       return 0;
-}
+char _license[] SEC("license") = "GPL";
index 9e225c9..6e25fba 100644
@@ -2,7 +2,16 @@
 /* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
  */
 static const char *__doc__ =
-       " XDP redirect with a CPU-map type \"BPF_MAP_TYPE_CPUMAP\"";
+"XDP CPU redirect tool, using BPF_MAP_TYPE_CPUMAP\n"
+"Usage: xdp_redirect_cpu -d <IFINDEX|IFNAME> -c 0 ... -c N\n"
+"Valid specification for CPUMAP BPF program:\n"
+"  --mprog-name/-e pass (use built-in XDP_PASS program)\n"
+"  --mprog-name/-e drop (use built-in XDP_DROP program)\n"
+"  --redirect-device/-r <ifindex|ifname> (use built-in DEVMAP redirect program)\n"
+"  Custom CPUMAP BPF program:\n"
+"    --mprog-filename/-f <filename> --mprog-name/-e <program>\n"
+"    Optionally, also pass --redirect-map/-m and --redirect-device/-r together\n"
+"    to configure DEVMAP in BPF object <filename>\n";
 
 #include <errno.h>
 #include <signal.h>
@@ -18,558 +27,62 @@ static const char *__doc__ =
 #include <net/if.h>
 #include <time.h>
 #include <linux/limits.h>
-
 #include <arpa/inet.h>
 #include <linux/if_link.h>
-
-/* How many xdp_progs are defined in _kern.c */
-#define MAX_PROG 6
-
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
-
 #include "bpf_util.h"
+#include "xdp_sample_user.h"
+#include "xdp_redirect_cpu.skel.h"
 
-static int ifindex = -1;
-static char ifname_buf[IF_NAMESIZE];
-static char *ifname;
-static __u32 prog_id;
-
-static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
-static int n_cpus;
-
-enum map_type {
-       CPU_MAP,
-       RX_CNT,
-       REDIRECT_ERR_CNT,
-       CPUMAP_ENQUEUE_CNT,
-       CPUMAP_KTHREAD_CNT,
-       CPUS_AVAILABLE,
-       CPUS_COUNT,
-       CPUS_ITERATOR,
-       EXCEPTION_CNT,
-};
+static int map_fd;
+static int avail_fd;
+static int count_fd;
 
-static const char *const map_type_strings[] = {
-       [CPU_MAP] = "cpu_map",
-       [RX_CNT] = "rx_cnt",
-       [REDIRECT_ERR_CNT] = "redirect_err_cnt",
-       [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt",
-       [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt",
-       [CPUS_AVAILABLE] = "cpus_available",
-       [CPUS_COUNT] = "cpus_count",
-       [CPUS_ITERATOR] = "cpus_iterator",
-       [EXCEPTION_CNT] = "exception_cnt",
-};
+static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_MAP_CNT |
+                 SAMPLE_CPUMAP_ENQUEUE_CNT | SAMPLE_CPUMAP_KTHREAD_CNT |
+                 SAMPLE_EXCEPTION_CNT;
 
-#define NUM_TP 5
-#define NUM_MAP 9
-struct bpf_link *tp_links[NUM_TP] = {};
-static int map_fds[NUM_MAP];
-static int tp_cnt = 0;
-
-/* Exit return codes */
-#define EXIT_OK                0
-#define EXIT_FAIL              1
-#define EXIT_FAIL_OPTION       2
-#define EXIT_FAIL_XDP          3
-#define EXIT_FAIL_BPF          4
-#define EXIT_FAIL_MEM          5
+DEFINE_SAMPLE_INIT(xdp_redirect_cpu);
 
 static const struct option long_options[] = {
-       {"help",        no_argument,            NULL, 'h' },
-       {"dev",         required_argument,      NULL, 'd' },
-       {"skb-mode",    no_argument,            NULL, 'S' },
-       {"sec",         required_argument,      NULL, 's' },
-       {"progname",    required_argument,      NULL, 'p' },
-       {"qsize",       required_argument,      NULL, 'q' },
-       {"cpu",         required_argument,      NULL, 'c' },
-       {"stress-mode", no_argument,            NULL, 'x' },
-       {"no-separators", no_argument,          NULL, 'z' },
-       {"force",       no_argument,            NULL, 'F' },
-       {"mprog-disable", no_argument,          NULL, 'n' },
-       {"mprog-name",  required_argument,      NULL, 'e' },
-       {"mprog-filename", required_argument,   NULL, 'f' },
-       {"redirect-device", required_argument,  NULL, 'r' },
-       {"redirect-map", required_argument,     NULL, 'm' },
-       {0, 0, NULL,  0 }
+       { "help", no_argument, NULL, 'h' },
+       { "dev", required_argument, NULL, 'd' },
+       { "skb-mode", no_argument, NULL, 'S' },
+       { "progname", required_argument, NULL, 'p' },
+       { "qsize", required_argument, NULL, 'q' },
+       { "cpu", required_argument, NULL, 'c' },
+       { "stress-mode", no_argument, NULL, 'x' },
+       { "force", no_argument, NULL, 'F' },
+       { "interval", required_argument, NULL, 'i' },
+       { "verbose", no_argument, NULL, 'v' },
+       { "stats", no_argument, NULL, 's' },
+       { "mprog-name", required_argument, NULL, 'e' },
+       { "mprog-filename", required_argument, NULL, 'f' },
+       { "redirect-device", required_argument, NULL, 'r' },
+       { "redirect-map", required_argument, NULL, 'm' },
+       {}
 };
 
-static void int_exit(int sig)
-{
-       __u32 curr_prog_id = 0;
-
-       if (ifindex > -1) {
-               if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) {
-                       printf("bpf_get_link_xdp_id failed\n");
-                       exit(EXIT_FAIL);
-               }
-               if (prog_id == curr_prog_id) {
-                       fprintf(stderr,
-                               "Interrupted: Removing XDP program on ifindex:%d device:%s\n",
-                               ifindex, ifname);
-                       bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
-               } else if (!curr_prog_id) {
-                       printf("couldn't find a prog id on a given iface\n");
-               } else {
-                       printf("program on interface changed, not removing\n");
-               }
-       }
-       /* Detach tracepoints */
-       while (tp_cnt)
-               bpf_link__destroy(tp_links[--tp_cnt]);
-
-       exit(EXIT_OK);
-}
-
 static void print_avail_progs(struct bpf_object *obj)
 {
        struct bpf_program *pos;
 
+       printf(" Programs to be used for -p/--progname:\n");
        bpf_object__for_each_program(pos, obj) {
-               if (bpf_program__is_xdp(pos))
-                       printf(" %s\n", bpf_program__section_name(pos));
-       }
-}
-
-static void usage(char *argv[], struct bpf_object *obj)
-{
-       int i;
-
-       printf("\nDOCUMENTATION:\n%s\n", __doc__);
-       printf("\n");
-       printf(" Usage: %s (options-see-below)\n", argv[0]);
-       printf(" Listing options:\n");
-       for (i = 0; long_options[i].name != 0; i++) {
-               printf(" --%-12s", long_options[i].name);
-               if (long_options[i].flag != NULL)
-                       printf(" flag (internal value:%d)",
-                               *long_options[i].flag);
-               else
-                       printf(" short-option: -%c",
-                               long_options[i].val);
-               printf("\n");
-       }
-       printf("\n Programs to be used for --progname:\n");
-       print_avail_progs(obj);
-       printf("\n");
-}
-
-/* gettime returns the current time of day in nanoseconds.
- * Cost: clock_gettime (ns) => 26ns (CLOCK_MONOTONIC)
- *       clock_gettime (ns) =>  9ns (CLOCK_MONOTONIC_COARSE)
- */
-#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
-static __u64 gettime(void)
-{
-       struct timespec t;
-       int res;
-
-       res = clock_gettime(CLOCK_MONOTONIC, &t);
-       if (res < 0) {
-               fprintf(stderr, "Error with gettimeofday! (%i)\n", res);
-               exit(EXIT_FAIL);
-       }
-       return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
-}
-
-/* Common stats data record shared with _kern.c */
-struct datarec {
-       __u64 processed;
-       __u64 dropped;
-       __u64 issue;
-       __u64 xdp_pass;
-       __u64 xdp_drop;
-       __u64 xdp_redirect;
-};
-struct record {
-       __u64 timestamp;
-       struct datarec total;
-       struct datarec *cpu;
-};
-struct stats_record {
-       struct record rx_cnt;
-       struct record redir_err;
-       struct record kthread;
-       struct record exception;
-       struct record enq[];
-};
-
-static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
-{
-       /* For percpu maps, userspace gets a value per possible CPU */
-       unsigned int nr_cpus = bpf_num_possible_cpus();
-       struct datarec values[nr_cpus];
-       __u64 sum_xdp_redirect = 0;
-       __u64 sum_xdp_pass = 0;
-       __u64 sum_xdp_drop = 0;
-       __u64 sum_processed = 0;
-       __u64 sum_dropped = 0;
-       __u64 sum_issue = 0;
-       int i;
-
-       if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
-               fprintf(stderr,
-                       "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
-               return false;
-       }
-       /* Get time as close as possible to reading map contents */
-       rec->timestamp = gettime();
-
-       /* Record and sum values from each CPU */
-       for (i = 0; i < nr_cpus; i++) {
-               rec->cpu[i].processed = values[i].processed;
-               sum_processed        += values[i].processed;
-               rec->cpu[i].dropped = values[i].dropped;
-               sum_dropped        += values[i].dropped;
-               rec->cpu[i].issue = values[i].issue;
-               sum_issue        += values[i].issue;
-               rec->cpu[i].xdp_pass = values[i].xdp_pass;
-               sum_xdp_pass += values[i].xdp_pass;
-               rec->cpu[i].xdp_drop = values[i].xdp_drop;
-               sum_xdp_drop += values[i].xdp_drop;
-               rec->cpu[i].xdp_redirect = values[i].xdp_redirect;
-               sum_xdp_redirect += values[i].xdp_redirect;
-       }
-       rec->total.processed = sum_processed;
-       rec->total.dropped   = sum_dropped;
-       rec->total.issue     = sum_issue;
-       rec->total.xdp_pass  = sum_xdp_pass;
-       rec->total.xdp_drop  = sum_xdp_drop;
-       rec->total.xdp_redirect = sum_xdp_redirect;
-       return true;
-}
-
-static struct datarec *alloc_record_per_cpu(void)
-{
-       unsigned int nr_cpus = bpf_num_possible_cpus();
-       struct datarec *array;
-
-       array = calloc(nr_cpus, sizeof(struct datarec));
-       if (!array) {
-               fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
-               exit(EXIT_FAIL_MEM);
-       }
-       return array;
-}
-
-static struct stats_record *alloc_stats_record(void)
-{
-       struct stats_record *rec;
-       int i, size;
-
-       size = sizeof(*rec) + n_cpus * sizeof(struct record);
-       rec = malloc(size);
-       if (!rec) {
-               fprintf(stderr, "Mem alloc error\n");
-               exit(EXIT_FAIL_MEM);
-       }
-       memset(rec, 0, size);
-       rec->rx_cnt.cpu    = alloc_record_per_cpu();
-       rec->redir_err.cpu = alloc_record_per_cpu();
-       rec->kthread.cpu   = alloc_record_per_cpu();
-       rec->exception.cpu = alloc_record_per_cpu();
-       for (i = 0; i < n_cpus; i++)
-               rec->enq[i].cpu = alloc_record_per_cpu();
-
-       return rec;
-}
-
-static void free_stats_record(struct stats_record *r)
-{
-       int i;
-
-       for (i = 0; i < n_cpus; i++)
-               free(r->enq[i].cpu);
-       free(r->exception.cpu);
-       free(r->kthread.cpu);
-       free(r->redir_err.cpu);
-       free(r->rx_cnt.cpu);
-       free(r);
-}
-
-static double calc_period(struct record *r, struct record *p)
-{
-       double period_ = 0;
-       __u64 period = 0;
-
-       period = r->timestamp - p->timestamp;
-       if (period > 0)
-               period_ = ((double) period / NANOSEC_PER_SEC);
-
-       return period_;
-}
-
-static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_)
-{
-       __u64 packets = 0;
-       __u64 pps = 0;
-
-       if (period_ > 0) {
-               packets = r->processed - p->processed;
-               pps = packets / period_;
-       }
-       return pps;
-}
-
-static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_)
-{
-       __u64 packets = 0;
-       __u64 pps = 0;
-
-       if (period_ > 0) {
-               packets = r->dropped - p->dropped;
-               pps = packets / period_;
-       }
-       return pps;
-}
-
-static __u64 calc_errs_pps(struct datarec *r,
-                           struct datarec *p, double period_)
-{
-       __u64 packets = 0;
-       __u64 pps = 0;
-
-       if (period_ > 0) {
-               packets = r->issue - p->issue;
-               pps = packets / period_;
-       }
-       return pps;
-}
-
-static void calc_xdp_pps(struct datarec *r, struct datarec *p,
-                        double *xdp_pass, double *xdp_drop,
-                        double *xdp_redirect, double period_)
-{
-       *xdp_pass = 0, *xdp_drop = 0, *xdp_redirect = 0;
-       if (period_ > 0) {
-               *xdp_redirect = (r->xdp_redirect - p->xdp_redirect) / period_;
-               *xdp_pass = (r->xdp_pass - p->xdp_pass) / period_;
-               *xdp_drop = (r->xdp_drop - p->xdp_drop) / period_;
-       }
-}
-
-static void stats_print(struct stats_record *stats_rec,
-                       struct stats_record *stats_prev,
-                       char *prog_name, char *mprog_name, int mprog_fd)
-{
-       unsigned int nr_cpus = bpf_num_possible_cpus();
-       double pps = 0, drop = 0, err = 0;
-       bool mprog_enabled = false;
-       struct record *rec, *prev;
-       int to_cpu;
-       double t;
-       int i;
-
-       if (mprog_fd > 0)
-               mprog_enabled = true;
-
-       /* Header */
-       printf("Running XDP/eBPF prog_name:%s\n", prog_name);
-       printf("%-15s %-7s %-14s %-11s %-9s\n",
-              "XDP-cpumap", "CPU:to", "pps", "drop-pps", "extra-info");
-
-       /* XDP rx_cnt */
-       {
-               char *fmt_rx = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n";
-               char *fm2_rx = "%-15s %-7s %'-14.0f %'-11.0f\n";
-               char *errstr = "";
-
-               rec  = &stats_rec->rx_cnt;
-               prev = &stats_prev->rx_cnt;
-               t = calc_period(rec, prev);
-               for (i = 0; i < nr_cpus; i++) {
-                       struct datarec *r = &rec->cpu[i];
-                       struct datarec *p = &prev->cpu[i];
-
-                       pps = calc_pps(r, p, t);
-                       drop = calc_drop_pps(r, p, t);
-                       err  = calc_errs_pps(r, p, t);
-                       if (err > 0)
-                               errstr = "cpu-dest/err";
-                       if (pps > 0)
-                               printf(fmt_rx, "XDP-RX",
-                                       i, pps, drop, err, errstr);
-               }
-               pps  = calc_pps(&rec->total, &prev->total, t);
-               drop = calc_drop_pps(&rec->total, &prev->total, t);
-               err  = calc_errs_pps(&rec->total, &prev->total, t);
-               printf(fm2_rx, "XDP-RX", "total", pps, drop);
-       }
-
-       /* cpumap enqueue stats */
-       for (to_cpu = 0; to_cpu < n_cpus; to_cpu++) {
-               char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
-               char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
-               char *errstr = "";
-
-               rec  =  &stats_rec->enq[to_cpu];
-               prev = &stats_prev->enq[to_cpu];
-               t = calc_period(rec, prev);
-               for (i = 0; i < nr_cpus; i++) {
-                       struct datarec *r = &rec->cpu[i];
-                       struct datarec *p = &prev->cpu[i];
-
-                       pps  = calc_pps(r, p, t);
-                       drop = calc_drop_pps(r, p, t);
-                       err  = calc_errs_pps(r, p, t);
-                       if (err > 0) {
-                               errstr = "bulk-average";
-                               err = pps / err; /* calc average bulk size */
-                       }
-                       if (pps > 0)
-                               printf(fmt, "cpumap-enqueue",
-                                      i, to_cpu, pps, drop, err, errstr);
-               }
-               pps = calc_pps(&rec->total, &prev->total, t);
-               if (pps > 0) {
-                       drop = calc_drop_pps(&rec->total, &prev->total, t);
-                       err  = calc_errs_pps(&rec->total, &prev->total, t);
-                       if (err > 0) {
-                               errstr = "bulk-average";
-                               err = pps / err; /* calc average bulk size */
-                       }
-                       printf(fm2, "cpumap-enqueue",
-                              "sum", to_cpu, pps, drop, err, errstr);
-               }
-       }
-
-       /* cpumap kthread stats */
-       {
-               char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n";
-               char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f %s\n";
-               char *e_str = "";
-
-               rec  = &stats_rec->kthread;
-               prev = &stats_prev->kthread;
-               t = calc_period(rec, prev);
-               for (i = 0; i < nr_cpus; i++) {
-                       struct datarec *r = &rec->cpu[i];
-                       struct datarec *p = &prev->cpu[i];
-
-                       pps  = calc_pps(r, p, t);
-                       drop = calc_drop_pps(r, p, t);
-                       err  = calc_errs_pps(r, p, t);
-                       if (err > 0)
-                               e_str = "sched";
-                       if (pps > 0)
-                               printf(fmt_k, "cpumap_kthread",
-                                      i, pps, drop, err, e_str);
-               }
-               pps = calc_pps(&rec->total, &prev->total, t);
-               drop = calc_drop_pps(&rec->total, &prev->total, t);
-               err  = calc_errs_pps(&rec->total, &prev->total, t);
-               if (err > 0)
-                       e_str = "sched-sum";
-               printf(fm2_k, "cpumap_kthread", "total", pps, drop, err, e_str);
-       }
-
-       /* XDP redirect err tracepoints (very unlikely) */
-       {
-               char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n";
-               char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n";
-
-               rec  = &stats_rec->redir_err;
-               prev = &stats_prev->redir_err;
-               t = calc_period(rec, prev);
-               for (i = 0; i < nr_cpus; i++) {
-                       struct datarec *r = &rec->cpu[i];
-                       struct datarec *p = &prev->cpu[i];
-
-                       pps  = calc_pps(r, p, t);
-                       drop = calc_drop_pps(r, p, t);
-                       if (pps > 0)
-                               printf(fmt_err, "redirect_err", i, pps, drop);
+               if (bpf_program__is_xdp(pos)) {
+                       if (!strncmp(bpf_program__name(pos), "xdp_prognum",
+                                    sizeof("xdp_prognum") - 1))
+                               printf(" %s\n", bpf_program__name(pos));
                }
-               pps = calc_pps(&rec->total, &prev->total, t);
-               drop = calc_drop_pps(&rec->total, &prev->total, t);
-               printf(fm2_err, "redirect_err", "total", pps, drop);
        }
-
-       /* XDP general exception tracepoints */
-       {
-               char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n";
-               char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n";
-
-               rec  = &stats_rec->exception;
-               prev = &stats_prev->exception;
-               t = calc_period(rec, prev);
-               for (i = 0; i < nr_cpus; i++) {
-                       struct datarec *r = &rec->cpu[i];
-                       struct datarec *p = &prev->cpu[i];
-
-                       pps  = calc_pps(r, p, t);
-                       drop = calc_drop_pps(r, p, t);
-                       if (pps > 0)
-                               printf(fmt_err, "xdp_exception", i, pps, drop);
-               }
-               pps = calc_pps(&rec->total, &prev->total, t);
-               drop = calc_drop_pps(&rec->total, &prev->total, t);
-               printf(fm2_err, "xdp_exception", "total", pps, drop);
-       }
-
-       /* CPUMAP attached XDP program that runs on remote/destination CPU */
-       if (mprog_enabled) {
-               char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f\n";
-               char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f\n";
-               double xdp_pass, xdp_drop, xdp_redirect;
-
-               printf("\n2nd remote XDP/eBPF prog_name: %s\n", mprog_name);
-               printf("%-15s %-7s %-14s %-11s %-9s\n",
-                      "XDP-cpumap", "CPU:to", "xdp-pass", "xdp-drop", "xdp-redir");
-
-               rec  = &stats_rec->kthread;
-               prev = &stats_prev->kthread;
-               t = calc_period(rec, prev);
-               for (i = 0; i < nr_cpus; i++) {
-                       struct datarec *r = &rec->cpu[i];
-                       struct datarec *p = &prev->cpu[i];
-
-                       calc_xdp_pps(r, p, &xdp_pass, &xdp_drop,
-                                    &xdp_redirect, t);
-                       if (xdp_pass > 0 || xdp_drop > 0 || xdp_redirect > 0)
-                               printf(fmt_k, "xdp-in-kthread", i, xdp_pass, xdp_drop,
-                                      xdp_redirect);
-               }
-               calc_xdp_pps(&rec->total, &prev->total, &xdp_pass, &xdp_drop,
-                            &xdp_redirect, t);
-               printf(fm2_k, "xdp-in-kthread", "total", xdp_pass, xdp_drop, xdp_redirect);
-       }
-
-       printf("\n");
-       fflush(stdout);
-}
-
-static void stats_collect(struct stats_record *rec)
-{
-       int fd, i;
-
-       fd = map_fds[RX_CNT];
-       map_collect_percpu(fd, 0, &rec->rx_cnt);
-
-       fd = map_fds[REDIRECT_ERR_CNT];
-       map_collect_percpu(fd, 1, &rec->redir_err);
-
-       fd = map_fds[CPUMAP_ENQUEUE_CNT];
-       for (i = 0; i < n_cpus; i++)
-               map_collect_percpu(fd, i, &rec->enq[i]);
-
-       fd = map_fds[CPUMAP_KTHREAD_CNT];
-       map_collect_percpu(fd, 0, &rec->kthread);
-
-       fd = map_fds[EXCEPTION_CNT];
-       map_collect_percpu(fd, 0, &rec->exception);
 }
 
-
-/* Pointer swap trick */
-static inline void swap(struct stats_record **a, struct stats_record **b)
+static void usage(char *argv[], const struct option *long_options,
+                 const char *doc, int mask, bool error, struct bpf_object *obj)
 {
-       struct stats_record *tmp;
-
-       tmp = *a;
-       *a = *b;
-       *b = tmp;
+       sample_usage(argv, long_options, doc, mask, error);
+       print_avail_progs(obj);
 }
 
 static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value,
@@ -582,39 +95,41 @@ static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value,
        /* Add a CPU entry to cpumap, as this allocates a cpu entry in
         * the kernel for the cpu.
         */
-       ret = bpf_map_update_elem(map_fds[CPU_MAP], &cpu, value, 0);
-       if (ret) {
-               fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret);
-               exit(EXIT_FAIL_BPF);
+       ret = bpf_map_update_elem(map_fd, &cpu, value, 0);
+       if (ret < 0) {
+               fprintf(stderr, "Create CPU entry failed: %s\n", strerror(errno));
+               return ret;
        }
 
        /* Inform bpf_progs that a new CPU is available to select
         * from via some control maps.
         */
-       ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &avail_idx, &cpu, 0);
-       if (ret) {
-               fprintf(stderr, "Add to avail CPUs failed\n");
-               exit(EXIT_FAIL_BPF);
+       ret = bpf_map_update_elem(avail_fd, &avail_idx, &cpu, 0);
+       if (ret < 0) {
+               fprintf(stderr, "Add to avail CPUs failed: %s\n", strerror(errno));
+               return ret;
        }
 
        /* When not replacing/updating existing entry, bump the count */
-       ret = bpf_map_lookup_elem(map_fds[CPUS_COUNT], &key, &curr_cpus_count);
-       if (ret) {
-               fprintf(stderr, "Failed reading curr cpus_count\n");
-               exit(EXIT_FAIL_BPF);
+       ret = bpf_map_lookup_elem(count_fd, &key, &curr_cpus_count);
+       if (ret < 0) {
+               fprintf(stderr, "Failed reading curr cpus_count: %s\n",
+                       strerror(errno));
+               return ret;
        }
        if (new) {
                curr_cpus_count++;
-               ret = bpf_map_update_elem(map_fds[CPUS_COUNT], &key,
+               ret = bpf_map_update_elem(count_fd, &key,
                                          &curr_cpus_count, 0);
-               if (ret) {
-                       fprintf(stderr, "Failed write curr cpus_count\n");
-                       exit(EXIT_FAIL_BPF);
+               if (ret < 0) {
+                       fprintf(stderr, "Failed write curr cpus_count: %s\n",
+                               strerror(errno));
+                       return ret;
                }
        }
-       /* map_fd[7] = cpus_iterator */
-       printf("%s CPU:%u as idx:%u qsize:%d prog_fd: %d (cpus_count:%u)\n",
-              new ? "Add-new":"Replace", cpu, avail_idx,
+
+       printf("%s CPU: %u as idx: %u qsize: %d cpumap_prog_fd: %d (cpus_count: %u)\n",
+              new ? "Add new" : "Replace", cpu, avail_idx,
               value->qsize, value->bpf_prog.fd, curr_cpus_count);
 
        return 0;
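
Setting aside the cpus_available/cpus_count bookkeeping, the heart of create_cpu_entry() is a single map update; as a self-contained sketch:

#include <linux/bpf.h>
#include <bpf/bpf.h>

/* Add (or replace) one cpumap slot: qsize bounds the remote CPU's
 * kthread queue, and bpf_prog.fd optionally attaches a program that
 * runs on that CPU (0 leaves it disabled).
 */
static int add_cpu_to_cpumap(int cpumap_fd, __u32 cpu, __u32 qsize,
			     int prog_fd)
{
	struct bpf_cpumap_val value = {
		.qsize = qsize,
		.bpf_prog.fd = prog_fd,
	};

	return bpf_map_update_elem(cpumap_fd, &cpu, &value, 0);
}

Updating an existing slot makes the kernel free and reallocate its bpf_cpu_map_entry, which is exactly the teardown path the --stress-mode option exercises.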
@@ -623,24 +138,29 @@ static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value,
 /* CPUs are zero-indexed. Thus, add a special sentinel default value
  * in map cpus_available to mark CPU indexes not configured
  */
-static void mark_cpus_unavailable(void)
+static int mark_cpus_unavailable(void)
 {
+       int ret, i, n_cpus = libbpf_num_possible_cpus();
        __u32 invalid_cpu = n_cpus;
-       int ret, i;
 
        for (i = 0; i < n_cpus; i++) {
-               ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &i,
+               ret = bpf_map_update_elem(avail_fd, &i,
                                          &invalid_cpu, 0);
-               if (ret) {
-                       fprintf(stderr, "Failed marking CPU unavailable\n");
-                       exit(EXIT_FAIL_BPF);
+               if (ret < 0) {
+                       fprintf(stderr, "Failed marking CPU unavailable: %s\n",
+                               strerror(errno));
+                       return ret;
                }
        }
+
+       return 0;
 }
 
 /* Stress cpumap management code by concurrently changing underlying cpumap */
-static void stress_cpumap(struct bpf_cpumap_val *value)
+static void stress_cpumap(void *ctx)
 {
+       struct bpf_cpumap_val *value = ctx;
+
        /* Changing qsize will cause kernel to free and alloc a new
         * bpf_cpu_map_entry, with an associated/complicated tear-down
         * procedure.
@@ -653,144 +173,163 @@ static void stress_cpumap(struct bpf_cpumap_val *value)
        create_cpu_entry(1, value, 0, false);
 }
 
-static void stats_poll(int interval, bool use_separators, char *prog_name,
-                      char *mprog_name, struct bpf_cpumap_val *value,
-                      bool stress_mode)
-{
-       struct stats_record *record, *prev;
-       int mprog_fd;
-
-       record = alloc_stats_record();
-       prev   = alloc_stats_record();
-       stats_collect(record);
-
-       /* Trick to pretty printf with thousands separators use %' */
-       if (use_separators)
-               setlocale(LC_NUMERIC, "en_US");
-
-       while (1) {
-               swap(&prev, &record);
-               mprog_fd = value->bpf_prog.fd;
-               stats_collect(record);
-               stats_print(record, prev, prog_name, mprog_name, mprog_fd);
-               sleep(interval);
-               if (stress_mode)
-                       stress_cpumap(value);
-       }
-
-       free_stats_record(record);
-       free_stats_record(prev);
-}
-
-static int init_tracepoints(struct bpf_object *obj)
+static int set_cpumap_prog(struct xdp_redirect_cpu *skel,
+                          const char *redir_interface, const char *redir_map,
+                          const char *mprog_filename, const char *mprog_name)
 {
-       struct bpf_program *prog;
-
-       bpf_object__for_each_program(prog, obj) {
-               if (bpf_program__is_tracepoint(prog) != true)
-                       continue;
-
-               tp_links[tp_cnt] = bpf_program__attach(prog);
-               if (libbpf_get_error(tp_links[tp_cnt])) {
-                       tp_links[tp_cnt] = NULL;
-                       return -EINVAL;
+       if (mprog_filename) {
+               struct bpf_program *prog;
+               struct bpf_object *obj;
+               int ret;
+
+               if (!mprog_name) {
+                       fprintf(stderr, "BPF program not specified for file %s\n",
+                               mprog_filename);
+                       goto end;
+               }
+               if ((redir_interface && !redir_map) || (!redir_interface && redir_map)) {
+                       fprintf(stderr, "--redirect-%s specified but --redirect-%s not specified\n",
+                               redir_interface ? "device" : "map", redir_interface ? "map" : "device");
+                       goto end;
                }
-               tp_cnt++;
-       }
-
-       return 0;
-}
-
-static int init_map_fds(struct bpf_object *obj)
-{
-       enum map_type type;
-
-       for (type = 0; type < NUM_MAP; type++) {
-               map_fds[type] =
-                       bpf_object__find_map_fd_by_name(obj,
-                                                       map_type_strings[type]);
-
-               if (map_fds[type] < 0)
-                       return -ENOENT;
-       }
-
-       return 0;
-}
 
-static int load_cpumap_prog(char *file_name, char *prog_name,
-                           char *redir_interface, char *redir_map)
-{
-       struct bpf_prog_load_attr prog_load_attr = {
-               .prog_type              = BPF_PROG_TYPE_XDP,
-               .expected_attach_type   = BPF_XDP_CPUMAP,
-               .file = file_name,
-       };
-       struct bpf_program *prog;
-       struct bpf_object *obj;
-       int fd;
+               /* Custom BPF program */
+               obj = bpf_object__open_file(mprog_filename, NULL);
+               if (!obj) {
+                       ret = -errno;
+                       fprintf(stderr, "Failed to bpf_prog_load_xattr: %s\n",
+                               strerror(errno));
+                       return ret;
+               }
 
-       if (bpf_prog_load_xattr(&prog_load_attr, &obj, &fd))
-               return -1;
+               ret = bpf_object__load(obj);
+               if (ret < 0) {
+                       ret = -errno;
+                       fprintf(stderr, "Failed to bpf_object__load: %s\n",
+                               strerror(errno));
+                       return ret;
+               }
 
-       if (fd < 0) {
-               fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n",
-                       strerror(errno));
-               return fd;
-       }
+               if (redir_map) {
+                       int err, redir_map_fd, ifindex_out, key = 0;
 
-       if (redir_interface && redir_map) {
-               int err, map_fd, ifindex_out, key = 0;
+                       redir_map_fd = bpf_object__find_map_fd_by_name(obj, redir_map);
+                       if (redir_map_fd < 0) {
+                               fprintf(stderr, "Failed to bpf_object__find_map_fd_by_name: %s\n",
+                                       strerror(errno));
+                               return redir_map_fd;
+                       }
 
-               map_fd = bpf_object__find_map_fd_by_name(obj, redir_map);
-               if (map_fd < 0)
-                       return map_fd;
+                       ifindex_out = if_nametoindex(redir_interface);
+                       if (!ifindex_out)
+                               ifindex_out = strtoul(redir_interface, NULL, 0);
+                       if (!ifindex_out) {
+                               fprintf(stderr, "Bad interface name or index\n");
+                               return -EINVAL;
+                       }
 
-               ifindex_out = if_nametoindex(redir_interface);
-               if (!ifindex_out)
-                       return -1;
+                       err = bpf_map_update_elem(redir_map_fd, &key, &ifindex_out, 0);
+                       if (err < 0)
+                               return err;
+               }
 
-               err = bpf_map_update_elem(map_fd, &key, &ifindex_out, 0);
-               if (err < 0)
-                       return err;
-       }
+               prog = bpf_object__find_program_by_name(obj, mprog_name);
+               if (!prog) {
+                       ret = -errno;
+                       fprintf(stderr, "Failed to bpf_object__find_program_by_name: %s\n",
+                               strerror(errno));
+                       return ret;
+               }
 
-       prog = bpf_object__find_program_by_title(obj, prog_name);
-       if (!prog) {
-               fprintf(stderr, "bpf_object__find_program_by_title failed\n");
-               return EXIT_FAIL;
+               return bpf_program__fd(prog);
+       } else {
+               if (mprog_name) {
+                       if (redir_interface || redir_map) {
+                               fprintf(stderr, "Need to specify --mprog-filename/-f\n");
+                               goto end;
+                       }
+                       if (!strcmp(mprog_name, "pass") || !strcmp(mprog_name, "drop")) {
+                               /* Use built-in pass/drop programs */
+                               return *mprog_name == 'p' ? bpf_program__fd(skel->progs.xdp_redirect_cpu_pass)
+                                       : bpf_program__fd(skel->progs.xdp_redirect_cpu_drop);
+                       } else {
+                               fprintf(stderr, "Unknown name \"%s\" for built-in BPF program\n",
+                                       mprog_name);
+                               goto end;
+                       }
+               } else {
+                       if (redir_map) {
+                               fprintf(stderr, "Need to specify --mprog-filename, --mprog-name and"
+                                       " --redirect-device with --redirect-map\n");
+                               goto end;
+                       }
+                       if (redir_interface) {
+                               /* Use built-in devmap redirect */
+                               struct bpf_devmap_val val = {};
+                               int ifindex_out, err;
+                               __u32 key = 0;
+
+                               if (!redir_interface)
+                                       return 0;
+
+                               ifindex_out = if_nametoindex(redir_interface);
+                               if (!ifindex_out)
+                                       ifindex_out = strtoul(redir_interface, NULL, 0);
+                               if (!ifindex_out) {
+                                       fprintf(stderr, "Bad interface name or index\n");
+                                       return -EINVAL;
+                               }
+
+                               if (get_mac_addr(ifindex_out, skel->bss->tx_mac_addr) < 0) {
+                                       printf("Get interface %d mac failed\n", ifindex_out);
+                                       return -EINVAL;
+                               }
+
+                               val.ifindex = ifindex_out;
+                               val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_redirect_egress_prog);
+                               err = bpf_map_update_elem(bpf_map__fd(skel->maps.tx_port), &key, &val, 0);
+                               if (err < 0)
+                                       return -errno;
+
+                               return bpf_program__fd(skel->progs.xdp_redirect_cpu_devmap);
+                       }
+               }
        }
 
-       return bpf_program__fd(prog);
+       /* Disabled */
+       return 0;
+end:
+       fprintf(stderr, "Invalid options for CPUMAP BPF program\n");
+       return -EINVAL;
 }
 
 int main(int argc, char **argv)
 {
-       char *prog_name = "xdp_cpu_map5_lb_hash_ip_pairs";
-       char *mprog_filename = "xdp_redirect_kern.o";
-       char *redir_interface = NULL, *redir_map = NULL;
-       char *mprog_name = "xdp_redirect_dummy";
-       bool mprog_disable = false;
-       struct bpf_prog_load_attr prog_load_attr = {
-               .prog_type      = BPF_PROG_TYPE_UNSPEC,
-       };
-       struct bpf_prog_info info = {};
-       __u32 info_len = sizeof(info);
+       const char *redir_interface = NULL, *redir_map = NULL;
+       const char *mprog_filename = NULL, *mprog_name = NULL;
+       struct xdp_redirect_cpu *skel;
+       struct bpf_map_info info = {};
+       char ifname_buf[IF_NAMESIZE];
        struct bpf_cpumap_val value;
-       bool use_separators = true;
+       __u32 infosz = sizeof(info);
+       int ret = EXIT_FAIL_OPTION;
+       unsigned long interval = 2;
        bool stress_mode = false;
        struct bpf_program *prog;
-       struct bpf_object *obj;
-       int err = EXIT_FAIL;
-       char filename[256];
+       const char *prog_name;
+       bool generic = false;
+       bool force = false;
        int added_cpus = 0;
+       bool error = true;
        int longindex = 0;
-       int interval = 2;
        int add_cpu = -1;
-       int opt, prog_fd;
-       int *cpu, i;
+       int ifindex = -1;
+       int *cpu, i, opt;
+       char *ifname;
        __u32 qsize;
+       int n_cpus;
 
-       n_cpus = get_nprocs_conf();
+       n_cpus = libbpf_num_possible_cpus();
 
        /* Notice: Choosing the queue size is very important when CPU is
         * configured with power-saving states.
@@ -810,73 +349,87 @@ int main(int argc, char **argv)
         */
        qsize = 2048;
 
-       snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-       prog_load_attr.file = filename;
-
-       if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
-               return err;
-
-       if (prog_fd < 0) {
-               fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n",
+       skel = xdp_redirect_cpu__open();
+       if (!skel) {
+               fprintf(stderr, "Failed to xdp_redirect_cpu__open: %s\n",
                        strerror(errno));
-               return err;
+               ret = EXIT_FAIL_BPF;
+               goto end;
+       }
+
+       ret = sample_init_pre_load(skel);
+       if (ret < 0) {
+               fprintf(stderr, "Failed to sample_init_pre_load: %s\n", strerror(-ret));
+               ret = EXIT_FAIL_BPF;
+               goto end_destroy;
        }
 
-       if (init_tracepoints(obj) < 0) {
-               fprintf(stderr, "ERR: bpf_program__attach failed\n");
-               return err;
+       if (bpf_map__set_max_entries(skel->maps.cpu_map, n_cpus) < 0) {
+               fprintf(stderr, "Failed to set max entries for cpu_map map: %s",
+                       strerror(errno));
+               ret = EXIT_FAIL_BPF;
+               goto end_destroy;
        }
 
-       if (init_map_fds(obj) < 0) {
-               fprintf(stderr, "bpf_object__find_map_fd_by_name failed\n");
-               return err;
+       if (bpf_map__set_max_entries(skel->maps.cpus_available, n_cpus) < 0) {
+               fprintf(stderr, "Failed to set max entries for cpus_available map: %s",
+                       strerror(errno));
+               ret = EXIT_FAIL_BPF;
+               goto end_destroy;
        }
-       mark_cpus_unavailable();
 
-       cpu = malloc(n_cpus * sizeof(int));
+       cpu = calloc(n_cpus, sizeof(int));
        if (!cpu) {
-               fprintf(stderr, "failed to allocate cpu array\n");
-               return err;
+               fprintf(stderr, "Failed to allocate cpu array\n");
+               goto end_destroy;
        }
-       memset(cpu, 0, n_cpus * sizeof(int));
 
-       /* Parse commands line args */
-       while ((opt = getopt_long(argc, argv, "hSd:s:p:q:c:xzFf:e:r:m:n",
+       prog = skel->progs.xdp_prognum5_lb_hash_ip_pairs;
+       while ((opt = getopt_long(argc, argv, "d:si:Sxp:f:e:r:m:c:q:Fvh",
                                  long_options, &longindex)) != -1) {
                switch (opt) {
                case 'd':
                        if (strlen(optarg) >= IF_NAMESIZE) {
-                               fprintf(stderr, "ERR: --dev name too long\n");
-                               goto error;
+                               fprintf(stderr, "-d/--dev name too long\n");
+                               goto end_cpu;
                        }
                        ifname = (char *)&ifname_buf;
-                       strncpy(ifname, optarg, IF_NAMESIZE);
+                       safe_strncpy(ifname, optarg, sizeof(ifname_buf));
                        ifindex = if_nametoindex(ifname);
-                       if (ifindex == 0) {
-                               fprintf(stderr,
-                                       "ERR: --dev name unknown err(%d):%s\n",
+                       if (!ifindex)
+                               ifindex = strtoul(optarg, NULL, 0);
+                       if (!ifindex) {
+                               fprintf(stderr, "Bad interface index or name (%d): %s\n",
                                        errno, strerror(errno));
-                               goto error;
+                               usage(argv, long_options, __doc__, mask, true, skel->obj);
+                               goto end_cpu;
                        }
                        break;
                case 's':
-                       interval = atoi(optarg);
+                       mask |= SAMPLE_REDIRECT_MAP_CNT;
+                       break;
+               case 'i':
+                       interval = strtoul(optarg, NULL, 0);
                        break;
                case 'S':
-                       xdp_flags |= XDP_FLAGS_SKB_MODE;
+                       generic = true;
                        break;
                case 'x':
                        stress_mode = true;
                        break;
-               case 'z':
-                       use_separators = false;
-                       break;
                case 'p':
                        /* Selecting eBPF prog to load */
                        prog_name = optarg;
-                       break;
-               case 'n':
-                       mprog_disable = true;
+                       prog = bpf_object__find_program_by_name(skel->obj,
+                                                               prog_name);
+                       if (!prog) {
+                               fprintf(stderr,
+                                       "Failed to find program %s specified by"
+                                       " option -p/--progname\n",
+                                       prog_name);
+                               print_avail_progs(skel->obj);
+                               goto end_cpu;
+                       }
                        break;
                case 'f':
                        mprog_filename = optarg;
@@ -886,6 +439,7 @@ int main(int argc, char **argv)
                        break;
                case 'r':
                        redir_interface = optarg;
+                       mask |= SAMPLE_DEVMAP_XMIT_CNT_MULTI;
                        break;
                case 'm':
                        redir_map = optarg;
@@ -895,93 +449,115 @@ int main(int argc, char **argv)
                        add_cpu = strtoul(optarg, NULL, 0);
                        if (add_cpu >= n_cpus) {
                                fprintf(stderr,
-                               "--cpu nr too large for cpumap err(%d):%s\n",
+                               "--cpu nr too large for cpumap err (%d):%s\n",
                                        errno, strerror(errno));
-                               goto error;
+                               usage(argv, long_options, __doc__, mask, true, skel->obj);
+                               goto end_cpu;
                        }
                        cpu[added_cpus++] = add_cpu;
                        break;
                case 'q':
-                       qsize = atoi(optarg);
+                       qsize = strtoul(optarg, NULL, 0);
                        break;
                case 'F':
-                       xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
+                       force = true;
+                       break;
+               case 'v':
+                       sample_switch_mode();
                        break;
                case 'h':
-               error:
+                       error = false;
                default:
-                       free(cpu);
-                       usage(argv, obj);
-                       return EXIT_FAIL_OPTION;
+                       usage(argv, long_options, __doc__, mask, error, skel->obj);
+                       goto end_cpu;
                }
        }
 
-       if (!(xdp_flags & XDP_FLAGS_SKB_MODE))
-               xdp_flags |= XDP_FLAGS_DRV_MODE;
-
-       /* Required option */
+       ret = EXIT_FAIL_OPTION;
        if (ifindex == -1) {
-               fprintf(stderr, "ERR: required option --dev missing\n");
-               usage(argv, obj);
-               err = EXIT_FAIL_OPTION;
-               goto out;
+               fprintf(stderr, "Required option --dev missing\n");
+               usage(argv, long_options, __doc__, mask, true, skel->obj);
+               goto end_cpu;
        }
-       /* Required option */
+
        if (add_cpu == -1) {
-               fprintf(stderr, "ERR: required option --cpu missing\n");
-               fprintf(stderr, " Specify multiple --cpu option to add more\n");
-               usage(argv, obj);
-               err = EXIT_FAIL_OPTION;
-               goto out;
+               fprintf(stderr, "Required option --cpu missing\n"
+                               "Specify multiple --cpu option to add more\n");
+               usage(argv, long_options, __doc__, mask, true, skel->obj);
+               goto end_cpu;
        }
 
-       value.bpf_prog.fd = 0;
-       if (!mprog_disable)
-               value.bpf_prog.fd = load_cpumap_prog(mprog_filename, mprog_name,
-                                                    redir_interface, redir_map);
-       if (value.bpf_prog.fd < 0) {
-               err = value.bpf_prog.fd;
-               goto out;
+       skel->rodata->from_match[0] = ifindex;
+       if (redir_interface)
+               skel->rodata->to_match[0] = if_nametoindex(redir_interface);
+
+       ret = xdp_redirect_cpu__load(skel);
+       if (ret < 0) {
+               fprintf(stderr, "Failed to xdp_redirect_cpu__load: %s\n",
+                       strerror(errno));
+               goto end_cpu;
        }
-       value.qsize = qsize;
 
-       for (i = 0; i < added_cpus; i++)
-               create_cpu_entry(cpu[i], &value, i, true);
+       ret = bpf_obj_get_info_by_fd(bpf_map__fd(skel->maps.cpu_map), &info, &infosz);
+       if (ret < 0) {
+               fprintf(stderr, "Failed bpf_obj_get_info_by_fd for cpumap: %s\n",
+                       strerror(errno));
+               goto end_cpu;
+       }
 
-       /* Remove XDP program when program is interrupted or killed */
-       signal(SIGINT, int_exit);
-       signal(SIGTERM, int_exit);
+       skel->bss->cpumap_map_id = info.id;
 
-       prog = bpf_object__find_program_by_title(obj, prog_name);
-       if (!prog) {
-               fprintf(stderr, "bpf_object__find_program_by_title failed\n");
-               goto out;
+       map_fd = bpf_map__fd(skel->maps.cpu_map);
+       avail_fd = bpf_map__fd(skel->maps.cpus_available);
+       count_fd = bpf_map__fd(skel->maps.cpus_count);
+
+       ret = mark_cpus_unavailable();
+       if (ret < 0) {
+               fprintf(stderr, "Unable to mark CPUs as unavailable\n");
+               goto end_cpu;
        }
 
-       prog_fd = bpf_program__fd(prog);
-       if (prog_fd < 0) {
-               fprintf(stderr, "bpf_program__fd failed\n");
-               goto out;
+       ret = sample_init(skel, mask);
+       if (ret < 0) {
+               fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret));
+               ret = EXIT_FAIL;
+               goto end_cpu;
        }
 
-       if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
-               fprintf(stderr, "link set xdp fd failed\n");
-               err = EXIT_FAIL_XDP;
-               goto out;
+       value.bpf_prog.fd = set_cpumap_prog(skel, redir_interface, redir_map,
+                                           mprog_filename, mprog_name);
+       if (value.bpf_prog.fd < 0) {
+               fprintf(stderr, "Failed to set CPUMAP BPF program: %s\n",
+                       strerror(-value.bpf_prog.fd));
+               usage(argv, long_options, __doc__, mask, true, skel->obj);
+               ret = EXIT_FAIL_BPF;
+               goto end_cpu;
        }
+       value.qsize = qsize;
 
-       err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
-       if (err) {
-               printf("can't get prog info - %s\n", strerror(errno));
-               goto out;
+       for (i = 0; i < added_cpus; i++) {
+               if (create_cpu_entry(cpu[i], &value, i, true) < 0) {
+                       fprintf(stderr, "Cannot proceed, exiting\n");
+                       usage(argv, long_options, __doc__, mask, true, skel->obj);
+                       goto end_cpu;
+               }
        }
-       prog_id = info.id;
 
-       stats_poll(interval, use_separators, prog_name, mprog_name,
-                  &value, stress_mode);
+       ret = EXIT_FAIL_XDP;
+       if (sample_install_xdp(prog, ifindex, generic, force) < 0)
+               goto end_cpu;
 
-       err = EXIT_OK;
-out:
+       ret = sample_run(interval, stress_mode ? stress_cpumap : NULL, &value);
+       if (ret < 0) {
+               fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret));
+               ret = EXIT_FAIL;
+               goto end_cpu;
+       }
+       ret = EXIT_OK;
+end_cpu:
        free(cpu);
-       return err;
+end_destroy:
+       xdp_redirect_cpu__destroy(skel);
+end:
+       sample_exit(ret);
 }
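
One detail of the conversion worth calling out: the tool stores the cpumap's kernel-assigned id in skel->bss->cpumap_map_id so the shared BPF side can match cpumap tracepoint events against this particular map. The id lookup generalizes into a small helper; a sketch:

#include <errno.h>
#include <bpf/bpf.h>

/* Return the kernel-assigned id of a BPF map, or -errno on failure. */
static int map_id_from_fd(int map_fd)
{
	struct bpf_map_info info = {};
	__u32 info_len = sizeof(info);

	if (bpf_obj_get_info_by_fd(map_fd, &info, &info_len) < 0)
		return -errno;
	return info.id;
}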
diff --git a/samples/bpf/xdp_redirect_kern.c b/samples/bpf/xdp_redirect_kern.c
deleted file mode 100644 (file)
index d26ec3a..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2016 John Fastabend <john.r.fastabend@intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- */
-#define KBUILD_MODNAME "foo"
-#include <uapi/linux/bpf.h>
-#include <linux/in.h>
-#include <linux/if_ether.h>
-#include <linux/if_packet.h>
-#include <linux/if_vlan.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <bpf/bpf_helpers.h>
-
-struct {
-       __uint(type, BPF_MAP_TYPE_ARRAY);
-       __type(key, int);
-       __type(value, int);
-       __uint(max_entries, 1);
-} tx_port SEC(".maps");
-
-/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
- * feedback.  Redirect TX errors can be caught via a tracepoint.
- */
-struct {
-       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
-       __type(key, u32);
-       __type(value, long);
-       __uint(max_entries, 1);
-} rxcnt SEC(".maps");
-
-static void swap_src_dst_mac(void *data)
-{
-       unsigned short *p = data;
-       unsigned short dst[3];
-
-       dst[0] = p[0];
-       dst[1] = p[1];
-       dst[2] = p[2];
-       p[0] = p[3];
-       p[1] = p[4];
-       p[2] = p[5];
-       p[3] = dst[0];
-       p[4] = dst[1];
-       p[5] = dst[2];
-}
-
-SEC("xdp_redirect")
-int xdp_redirect_prog(struct xdp_md *ctx)
-{
-       void *data_end = (void *)(long)ctx->data_end;
-       void *data = (void *)(long)ctx->data;
-       struct ethhdr *eth = data;
-       int rc = XDP_DROP;
-       int *ifindex, port = 0;
-       long *value;
-       u32 key = 0;
-       u64 nh_off;
-
-       nh_off = sizeof(*eth);
-       if (data + nh_off > data_end)
-               return rc;
-
-       ifindex = bpf_map_lookup_elem(&tx_port, &port);
-       if (!ifindex)
-               return rc;
-
-       value = bpf_map_lookup_elem(&rxcnt, &key);
-       if (value)
-               *value += 1;
-
-       swap_src_dst_mac(data);
-       return bpf_redirect(*ifindex, 0);
-}
-
-/* Redirect requires an XDP bpf_prog loaded on the TX device */
-SEC("xdp_redirect_dummy")
-int xdp_redirect_dummy_prog(struct xdp_md *ctx)
-{
-       return XDP_PASS;
-}
-
-char _license[] SEC("license") = "GPL";
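
XDP has two redirect flavors: bpf_redirect() takes the target ifindex directly, as the deleted program above did, while bpf_redirect_map() resolves the target through a map, which lets userspace retarget at runtime and (for devmaps) attach a second-stage egress program. For reference, a minimal sketch of each (the fixed ifindex is hypothetical):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* Direct flavor: ifindex supplied by the program (4 is hypothetical). */
SEC("xdp")
int redirect_plain(struct xdp_md *ctx)
{
	return bpf_redirect(4, 0);
}

/* Map flavor: the target lives in a devmap slot userspace can rewrite. */
struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
	__uint(max_entries, 1);
} tx_port SEC(".maps");

SEC("xdp")
int redirect_via_map(struct xdp_md *ctx)
{
	return bpf_redirect_map(&tx_port, 0, 0);
}

char _license[] SEC("license") = "GPL";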
diff --git a/samples/bpf/xdp_redirect_map_kern.c b/samples/bpf/xdp_redirect_map.bpf.c
similarity index 57%
rename from samples/bpf/xdp_redirect_map_kern.c
rename to samples/bpf/xdp_redirect_map.bpf.c
index a92b8e5..59efd65 100644 (file)
  * General Public License for more details.
  */
 #define KBUILD_MODNAME "foo"
-#include <uapi/linux/bpf.h>
-#include <linux/in.h>
-#include <linux/if_ether.h>
-#include <linux/if_packet.h>
-#include <linux/if_vlan.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <bpf/bpf_helpers.h>
+
+#include "vmlinux.h"
+#include "xdp_sample.bpf.h"
+#include "xdp_sample_shared.h"
 
 /* The 2nd xdp prog on egress does not support skb mode, so we define two
  * maps, tx_port_general and tx_port_native.
@@ -26,114 +22,71 @@ struct {
        __uint(type, BPF_MAP_TYPE_DEVMAP);
        __uint(key_size, sizeof(int));
        __uint(value_size, sizeof(int));
-       __uint(max_entries, 100);
+       __uint(max_entries, 1);
 } tx_port_general SEC(".maps");
 
 struct {
        __uint(type, BPF_MAP_TYPE_DEVMAP);
        __uint(key_size, sizeof(int));
        __uint(value_size, sizeof(struct bpf_devmap_val));
-       __uint(max_entries, 100);
-} tx_port_native SEC(".maps");
-
-/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
- * feedback.  Redirect TX errors can be caught via a tracepoint.
- */
-struct {
-       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
-       __type(key, u32);
-       __type(value, long);
-       __uint(max_entries, 1);
-} rxcnt SEC(".maps");
-
-/* map to store egress interface mac address */
-struct {
-       __uint(type, BPF_MAP_TYPE_ARRAY);
-       __type(key, u32);
-       __type(value, __be64);
        __uint(max_entries, 1);
-} tx_mac SEC(".maps");
-
-static void swap_src_dst_mac(void *data)
-{
-       unsigned short *p = data;
-       unsigned short dst[3];
+} tx_port_native SEC(".maps");
 
-       dst[0] = p[0];
-       dst[1] = p[1];
-       dst[2] = p[2];
-       p[0] = p[3];
-       p[1] = p[4];
-       p[2] = p[5];
-       p[3] = dst[0];
-       p[4] = dst[1];
-       p[5] = dst[2];
-}
+/* store egress interface mac address */
+const volatile char tx_mac_addr[ETH_ALEN];
 
 static __always_inline int xdp_redirect_map(struct xdp_md *ctx, void *redirect_map)
 {
        void *data_end = (void *)(long)ctx->data_end;
        void *data = (void *)(long)ctx->data;
+       u32 key = bpf_get_smp_processor_id();
        struct ethhdr *eth = data;
-       int rc = XDP_DROP;
-       long *value;
-       u32 key = 0;
+       struct datarec *rec;
        u64 nh_off;
-       int vport;
 
        nh_off = sizeof(*eth);
        if (data + nh_off > data_end)
-               return rc;
-
-       /* constant virtual port */
-       vport = 0;
-
-       /* count packet in global counter */
-       value = bpf_map_lookup_elem(&rxcnt, &key);
-       if (value)
-               *value += 1;
+               return XDP_DROP;
 
+       rec = bpf_map_lookup_elem(&rx_cnt, &key);
+       if (!rec)
+               return XDP_PASS;
+       NO_TEAR_INC(rec->processed);
        swap_src_dst_mac(data);
-
-       /* send packet out physical port */
-       return bpf_redirect_map(redirect_map, vport, 0);
+       return bpf_redirect_map(redirect_map, 0, 0);
 }
 
-SEC("xdp_redirect_general")
+SEC("xdp")
 int xdp_redirect_map_general(struct xdp_md *ctx)
 {
        return xdp_redirect_map(ctx, &tx_port_general);
 }
 
-SEC("xdp_redirect_native")
+SEC("xdp")
 int xdp_redirect_map_native(struct xdp_md *ctx)
 {
        return xdp_redirect_map(ctx, &tx_port_native);
 }
 
-SEC("xdp_devmap/map_prog")
+SEC("xdp_devmap/egress")
 int xdp_redirect_map_egress(struct xdp_md *ctx)
 {
        void *data_end = (void *)(long)ctx->data_end;
        void *data = (void *)(long)ctx->data;
        struct ethhdr *eth = data;
-       __be64 *mac;
-       u32 key = 0;
        u64 nh_off;
 
        nh_off = sizeof(*eth);
        if (data + nh_off > data_end)
                return XDP_DROP;
 
-       mac = bpf_map_lookup_elem(&tx_mac, &key);
-       if (mac)
-               __builtin_memcpy(eth->h_source, mac, ETH_ALEN);
+       __builtin_memcpy(eth->h_source, (const char *)tx_mac_addr, ETH_ALEN);
 
        return XDP_PASS;
 }
 
 /* Redirect requires an XDP bpf_prog loaded on the TX device */
-SEC("xdp_redirect_dummy")
+SEC("xdp")
 int xdp_redirect_dummy_prog(struct xdp_md *ctx)
 {
        return XDP_PASS;
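
As the comment at the top of this file notes, the second-stage egress program cannot run in skb (generic) mode, hence the twin maps: tx_port_general carries plain ifindex values while tx_port_native carries struct bpf_devmap_val entries that also name the egress program. Filling the native slot from userspace looks roughly like this (a sketch; the fds would come from the loaded skeleton):

#include <linux/bpf.h>
#include <bpf/bpf.h>

/* Point devmap slot 0 at ifindex_out and attach the second-stage
 * program that runs on the egress device just before transmission.
 */
static int set_native_tx_port(int devmap_fd, int ifindex_out,
			      int egress_prog_fd)
{
	struct bpf_devmap_val val = {
		.ifindex = ifindex_out,
		.bpf_prog.fd = egress_prog_fd,
	};
	__u32 key = 0;

	return bpf_map_update_elem(devmap_fd, &key, &val, 0);
}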
diff --git a/samples/bpf/xdp_redirect_map_multi_kern.c b/samples/bpf/xdp_redirect_map_multi.bpf.c
similarity index 64%
rename from samples/bpf/xdp_redirect_map_multi_kern.c
rename to samples/bpf/xdp_redirect_map_multi.bpf.c
index 71aa23d..8f59d43 100644 (file)
@@ -1,11 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0
 #define KBUILD_MODNAME "foo"
-#include <uapi/linux/bpf.h>
-#include <linux/in.h>
-#include <linux/if_ether.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <bpf/bpf_helpers.h>
+
+#include "vmlinux.h"
+#include "xdp_sample.bpf.h"
+#include "xdp_sample_shared.h"
+
+enum {
+       BPF_F_BROADCAST         = (1ULL << 3),
+       BPF_F_EXCLUDE_INGRESS   = (1ULL << 4),
+};
 
 struct {
        __uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
@@ -21,50 +24,41 @@ struct {
        __uint(max_entries, 32);
 } forward_map_native SEC(".maps");
 
+/* map to store egress interfaces mac addresses */
 struct {
-       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
-       __type(key, u32);
-       __type(value, long);
-       __uint(max_entries, 1);
-} rxcnt SEC(".maps");
-
-/* map to store egress interfaces mac addresses, set the
- * max_entries to 1 and extend it in user space prog.
- */
-struct {
-       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __uint(type, BPF_MAP_TYPE_HASH);
        __type(key, u32);
        __type(value, __be64);
-       __uint(max_entries, 1);
+       __uint(max_entries, 32);
 } mac_map SEC(".maps");
 
 static int xdp_redirect_map(struct xdp_md *ctx, void *forward_map)
 {
-       long *value;
-       u32 key = 0;
+       u32 key = bpf_get_smp_processor_id();
+       struct datarec *rec;
 
-       /* count packet in global counter */
-       value = bpf_map_lookup_elem(&rxcnt, &key);
-       if (value)
-               *value += 1;
+       rec = bpf_map_lookup_elem(&rx_cnt, &key);
+       if (!rec)
+               return XDP_PASS;
+       NO_TEAR_INC(rec->processed);
 
-       return bpf_redirect_map(forward_map, key,
+       return bpf_redirect_map(forward_map, 0,
                                BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
 }
 
-SEC("xdp_redirect_general")
+SEC("xdp")
 int xdp_redirect_map_general(struct xdp_md *ctx)
 {
        return xdp_redirect_map(ctx, &forward_map_general);
 }
 
-SEC("xdp_redirect_native")
+SEC("xdp")
 int xdp_redirect_map_native(struct xdp_md *ctx)
 {
        return xdp_redirect_map(ctx, &forward_map_native);
 }
 
-SEC("xdp_devmap/map_prog")
+SEC("xdp_devmap/egress")
 int xdp_devmap_prog(struct xdp_md *ctx)
 {
        void *data_end = (void *)(long)ctx->data_end;
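
With BPF_F_BROADCAST set, the index argument to bpf_redirect_map() is ignored and the frame is cloned to every interface in the map; BPF_F_EXCLUDE_INGRESS additionally skips the interface the frame arrived on. Userspace therefore only has to enroll the participating interfaces; a sketch (keying the DEVMAP_HASH by ifindex is an assumption, though a natural one):

#include <bpf/bpf.h>

/* Enroll each forwarding interface in the devmap-hash used for the
 * broadcast redirect (plain int values: no egress program here).
 */
static int fill_forward_map(int map_fd, const int *ifaces, int n_ifaces)
{
	int i, err;

	for (i = 0; i < n_ifaces; i++) {
		err = bpf_map_update_elem(map_fd, &ifaces[i], &ifaces[i], 0);
		if (err < 0)
			return err;
	}
	return 0;
}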
diff --git a/samples/bpf/xdp_redirect_map_multi_user.c b/samples/bpf/xdp_redirect_map_multi_user.c
index 84cdbbe..3153147 100644 (file)
@@ -1,7 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
+static const char *__doc__ =
+"XDP multi redirect tool, using BPF_MAP_TYPE_DEVMAP and BPF_F_BROADCAST flag for bpf_redirect_map\n"
+"Usage: xdp_redirect_map_multi <IFINDEX|IFNAME> <IFINDEX|IFNAME> ... <IFINDEX|IFNAME>\n";
+
 #include <linux/bpf.h>
 #include <linux/if_link.h>
 #include <assert.h>
+#include <getopt.h>
 #include <errno.h>
 #include <signal.h>
 #include <stdio.h>
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
-
-#include "bpf_util.h"
+#include <linux/if_ether.h>
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
+#include "bpf_util.h"
+#include "xdp_sample_user.h"
+#include "xdp_redirect_map_multi.skel.h"
 
 #define MAX_IFACE_NUM 32
-
-static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
 static int ifaces[MAX_IFACE_NUM] = {};
-static int rxcnt_map_fd;
-
-static void int_exit(int sig)
-{
-       __u32 prog_id = 0;
-       int i;
-
-       for (i = 0; ifaces[i] > 0; i++) {
-               if (bpf_get_link_xdp_id(ifaces[i], &prog_id, xdp_flags)) {
-                       printf("bpf_get_link_xdp_id failed\n");
-                       exit(1);
-               }
-               if (prog_id)
-                       bpf_set_link_xdp_fd(ifaces[i], -1, xdp_flags);
-       }
-
-       exit(0);
-}
-
-static void poll_stats(int interval)
-{
-       unsigned int nr_cpus = bpf_num_possible_cpus();
-       __u64 values[nr_cpus], prev[nr_cpus];
-
-       memset(prev, 0, sizeof(prev));
-
-       while (1) {
-               __u64 sum = 0;
-               __u32 key = 0;
-               int i;
 
-               sleep(interval);
-               assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0);
-               for (i = 0; i < nr_cpus; i++)
-                       sum += (values[i] - prev[i]);
-               if (sum)
-                       printf("Forwarding %10llu pkt/s\n", sum / interval);
-               memcpy(prev, values, sizeof(values));
-       }
-}
-
-static int get_mac_addr(unsigned int ifindex, void *mac_addr)
-{
-       char ifname[IF_NAMESIZE];
-       struct ifreq ifr;
-       int fd, ret = -1;
-
-       fd = socket(AF_INET, SOCK_DGRAM, 0);
-       if (fd < 0)
-               return ret;
-
-       if (!if_indextoname(ifindex, ifname))
-               goto err_out;
-
-       strcpy(ifr.ifr_name, ifname);
+static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_MAP_CNT |
+                 SAMPLE_EXCEPTION_CNT | SAMPLE_DEVMAP_XMIT_CNT |
+                 SAMPLE_DEVMAP_XMIT_CNT_MULTI | SAMPLE_SKIP_HEADING;
 
-       if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
-               goto err_out;
+DEFINE_SAMPLE_INIT(xdp_redirect_map_multi);
 
-       memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char));
-       ret = 0;
+static const struct option long_options[] = {
+       { "help", no_argument, NULL, 'h' },
+       { "skb-mode", no_argument, NULL, 'S' },
+       { "force", no_argument, NULL, 'F' },
+       { "load-egress", no_argument, NULL, 'X' },
+       { "stats", no_argument, NULL, 's' },
+       { "interval", required_argument, NULL, 'i' },
+       { "verbose", no_argument, NULL, 'v' },
+       {}
+};
 
-err_out:
-       close(fd);
-       return ret;
-}
-
-static int update_mac_map(struct bpf_object *obj)
+static int update_mac_map(struct bpf_map *map)
 {
-       int i, ret = -1, mac_map_fd;
+       int mac_map_fd = bpf_map__fd(map);
        unsigned char mac_addr[6];
        unsigned int ifindex;
-
-       mac_map_fd = bpf_object__find_map_fd_by_name(obj, "mac_map");
-       if (mac_map_fd < 0) {
-               printf("find mac map fd failed\n");
-               return ret;
-       }
+       int i, ret = -1;
 
        for (i = 0; ifaces[i] > 0; i++) {
                ifindex = ifaces[i];
 
                ret = get_mac_addr(ifindex, mac_addr);
                if (ret < 0) {
-                       printf("get interface %d mac failed\n", ifindex);
+                       fprintf(stderr, "get interface %d mac failed\n",
+                               ifindex);
                        return ret;
                }
 
                ret = bpf_map_update_elem(mac_map_fd, &ifindex, mac_addr, 0);
-               if (ret) {
-                       perror("bpf_update_elem mac_map_fd");
+               if (ret < 0) {
+                       fprintf(stderr, "Failed to update mac address for ifindex %d\n",
+                               ifindex);
                        return ret;
                }
        }
@@ -122,181 +75,159 @@ static int update_mac_map(struct bpf_object *obj)
        return 0;
 }
 
-static void usage(const char *prog)
-{
-       fprintf(stderr,
-               "usage: %s [OPTS] <IFNAME|IFINDEX> <IFNAME|IFINDEX> ...\n"
-               "OPTS:\n"
-               "    -S    use skb-mode\n"
-               "    -N    enforce native mode\n"
-               "    -F    force loading prog\n"
-               "    -X    load xdp program on egress\n",
-               prog);
-}
-
 int main(int argc, char **argv)
 {
-       int i, ret, opt, forward_map_fd, max_ifindex = 0;
-       struct bpf_program *ingress_prog, *egress_prog;
-       int ingress_prog_fd, egress_prog_fd = 0;
-       struct bpf_devmap_val devmap_val;
-       bool attach_egress_prog = false;
+       struct bpf_devmap_val devmap_val = {};
+       struct xdp_redirect_map_multi *skel;
+       struct bpf_program *ingress_prog;
+       bool xdp_devmap_attached = false;
+       struct bpf_map *forward_map;
+       int ret = EXIT_FAIL_OPTION;
+       unsigned long interval = 2;
        char ifname[IF_NAMESIZE];
-       struct bpf_map *mac_map;
-       struct bpf_object *obj;
        unsigned int ifindex;
-       char filename[256];
-
-       while ((opt = getopt(argc, argv, "SNFX")) != -1) {
+       bool generic = false;
+       bool force = false;
+       bool tried = false;
+       bool error = true;
+       int i, opt;
+
+       while ((opt = getopt_long(argc, argv, "hSFXi:vs",
+                                 long_options, NULL)) != -1) {
                switch (opt) {
                case 'S':
-                       xdp_flags |= XDP_FLAGS_SKB_MODE;
-                       break;
-               case 'N':
-                       /* default, set below */
+                       generic = true;
+                       /* devmap_xmit tracepoint not available */
+                       mask &= ~(SAMPLE_DEVMAP_XMIT_CNT |
+                                 SAMPLE_DEVMAP_XMIT_CNT_MULTI);
                        break;
                case 'F':
-                       xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
+                       force = true;
                        break;
                case 'X':
-                       attach_egress_prog = true;
+                       xdp_devmap_attached = true;
+                       break;
+               case 'i':
+                       interval = strtoul(optarg, NULL, 0);
+                       break;
+               case 'v':
+                       sample_switch_mode();
                        break;
+               case 's':
+                       mask |= SAMPLE_REDIRECT_MAP_CNT;
+                       break;
+               case 'h':
+                       error = false;
                default:
-                       usage(basename(argv[0]));
-                       return 1;
+                       sample_usage(argv, long_options, __doc__, mask, error);
+                       return ret;
                }
        }
 
-       if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) {
-               xdp_flags |= XDP_FLAGS_DRV_MODE;
-       } else if (attach_egress_prog) {
-               printf("Load xdp program on egress with SKB mode not supported yet\n");
-               return 1;
+       if (argc <= optind + 1) {
+               sample_usage(argv, long_options, __doc__, mask, error);
+               return ret;
        }
 
-       if (optind == argc) {
-               printf("usage: %s <IFNAME|IFINDEX> <IFNAME|IFINDEX> ...\n", argv[0]);
-               return 1;
+       skel = xdp_redirect_map_multi__open();
+       if (!skel) {
+               fprintf(stderr, "Failed to xdp_redirect_map_multi__open: %s\n",
+                       strerror(errno));
+               ret = EXIT_FAIL_BPF;
+               goto end;
        }
 
-       printf("Get interfaces");
+       ret = sample_init_pre_load(skel);
+       if (ret < 0) {
+               fprintf(stderr, "Failed to sample_init_pre_load: %s\n", strerror(-ret));
+               ret = EXIT_FAIL_BPF;
+               goto end_destroy;
+       }
+
+       ret = EXIT_FAIL_OPTION;
        for (i = 0; i < MAX_IFACE_NUM && argv[optind + i]; i++) {
                ifaces[i] = if_nametoindex(argv[optind + i]);
                if (!ifaces[i])
                        ifaces[i] = strtoul(argv[optind + i], NULL, 0);
                if (!if_indextoname(ifaces[i], ifname)) {
-                       perror("Invalid interface name or i");
-                       return 1;
+                       fprintf(stderr, "Bad interface index or name\n");
+                       sample_usage(argv, long_options, __doc__, mask, true);
+                       goto end_destroy;
                }
 
-               /* Find the largest index number */
-               if (ifaces[i] > max_ifindex)
-                       max_ifindex = ifaces[i];
-
-               printf(" %d", ifaces[i]);
-       }
-       printf("\n");
-
-       snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-
-       obj = bpf_object__open(filename);
-       if (libbpf_get_error(obj)) {
-               printf("ERROR: opening BPF object file failed\n");
-               obj = NULL;
-               goto err_out;
+               skel->rodata->from_match[i] = ifaces[i];
+               skel->rodata->to_match[i] = ifaces[i];
        }
 
-       /* Reset the map size to max ifindex + 1 */
-       if (attach_egress_prog) {
-               mac_map = bpf_object__find_map_by_name(obj, "mac_map");
-               ret = bpf_map__resize(mac_map, max_ifindex + 1);
-               if (ret < 0) {
-                       printf("ERROR: reset mac map size failed\n");
-                       goto err_out;
-               }
+       ret = xdp_redirect_map_multi__load(skel);
+       if (ret < 0) {
+               fprintf(stderr, "Failed to xdp_redirect_map_multi__load: %s\n",
+                       strerror(errno));
+               ret = EXIT_FAIL_BPF;
+               goto end_destroy;
        }
 
-       /* load BPF program */
-       if (bpf_object__load(obj)) {
-               printf("ERROR: loading BPF object file failed\n");
-               goto err_out;
-       }
-
-       if (xdp_flags & XDP_FLAGS_SKB_MODE) {
-               ingress_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_general");
-               forward_map_fd = bpf_object__find_map_fd_by_name(obj, "forward_map_general");
-       } else {
-               ingress_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_native");
-               forward_map_fd = bpf_object__find_map_fd_by_name(obj, "forward_map_native");
-       }
-       if (!ingress_prog || forward_map_fd < 0) {
-               printf("finding ingress_prog/forward_map in obj file failed\n");
-               goto err_out;
-       }
-
-       ingress_prog_fd = bpf_program__fd(ingress_prog);
-       if (ingress_prog_fd < 0) {
-               printf("find ingress_prog fd failed\n");
-               goto err_out;
-       }
-
-       rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt");
-       if (rxcnt_map_fd < 0) {
-               printf("bpf_object__find_map_fd_by_name failed\n");
-               goto err_out;
-       }
-
-       if (attach_egress_prog) {
+       if (xdp_devmap_attached) {
                /* Update mac_map with all egress interfaces' mac addr */
-               if (update_mac_map(obj) < 0) {
-                       printf("Error: update mac map failed");
-                       goto err_out;
+               if (update_mac_map(skel->maps.mac_map) < 0) {
+                       fprintf(stderr, "Updating mac address failed\n");
+                       ret = EXIT_FAIL;
+                       goto end_destroy;
                }
+       }
 
-               /* Find egress prog fd */
-               egress_prog = bpf_object__find_program_by_name(obj, "xdp_devmap_prog");
-               if (!egress_prog) {
-                       printf("finding egress_prog in obj file failed\n");
-                       goto err_out;
-               }
-               egress_prog_fd = bpf_program__fd(egress_prog);
-               if (egress_prog_fd < 0) {
-                       printf("find egress_prog fd failed\n");
-                       goto err_out;
-               }
+       ret = sample_init(skel, mask);
+       if (ret < 0) {
+               fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret));
+               ret = EXIT_FAIL;
+               goto end_destroy;
        }
 
-       /* Remove attached program when program is interrupted or killed */
-       signal(SIGINT, int_exit);
-       signal(SIGTERM, int_exit);
+       ingress_prog = skel->progs.xdp_redirect_map_native;
+       forward_map = skel->maps.forward_map_native;
 
-       /* Init forward multicast groups */
        for (i = 0; ifaces[i] > 0; i++) {
                ifindex = ifaces[i];
 
+               ret = EXIT_FAIL_XDP;
+restart:
                /* bind prog_fd to each interface */
-               ret = bpf_set_link_xdp_fd(ifindex, ingress_prog_fd, xdp_flags);
-               if (ret) {
-                       printf("Set xdp fd failed on %d\n", ifindex);
-                       goto err_out;
+               if (sample_install_xdp(ingress_prog, ifindex, generic, force) < 0) {
+                       if (generic && !tried) {
+                               fprintf(stderr,
+                                       "Trying fallback to sizeof(int) as value_size for devmap in generic mode\n");
+                               ingress_prog = skel->progs.xdp_redirect_map_general;
+                               forward_map = skel->maps.forward_map_general;
+                               tried = true;
+                               goto restart;
+                       }
+                       goto end_destroy;
                }
 
                /* Add all the interfaces to forward group and attach
-                * egress devmap programe if exist
+                * egress devmap program if it exists
                 */
                devmap_val.ifindex = ifindex;
-               devmap_val.bpf_prog.fd = egress_prog_fd;
-               ret = bpf_map_update_elem(forward_map_fd, &ifindex, &devmap_val, 0);
-               if (ret) {
-                       perror("bpf_map_update_elem forward_map");
-                       goto err_out;
+               if (xdp_devmap_attached)
+                       devmap_val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_devmap_prog);
+               ret = bpf_map_update_elem(bpf_map__fd(forward_map), &ifindex, &devmap_val, 0);
+               if (ret < 0) {
+                       fprintf(stderr, "Failed to update devmap value: %s\n",
+                               strerror(errno));
+                       ret = EXIT_FAIL_BPF;
+                       goto end_destroy;
                }
        }
 
-       poll_stats(2);
-
-       return 0;
-
-err_out:
-       return 1;
+       ret = sample_run(interval, NULL, NULL);
+       if (ret < 0) {
+               fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret));
+               ret = EXIT_FAIL;
+               goto end_destroy;
+       }
+       ret = EXIT_OK;
+end_destroy:
+       xdp_redirect_map_multi__destroy(skel);
+end:
+       sample_exit(ret);
 }
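
Note: the devmap update loop stores a struct bpf_devmap_val rather than a bare
ifindex; with -X, the egress program's fd rides along and the kernel runs that
program on every frame transmitted through the entry. For reference, the UAPI
layout from include/uapi/linux/bpf.h:

struct bpf_devmap_val {
        __u32 ifindex;     /* egress net_device ifindex */
        union {
                int   fd;  /* prog fd on map write */
                __u32 id;  /* prog id on map read */
        } bpf_prog;
};
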
index 0e81926..b6e4fc8 100644
@@ -1,6 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
  */
+static const char *__doc__ =
+"XDP redirect tool, using BPF_MAP_TYPE_DEVMAP\n"
+"Usage: xdp_redirect_map <IFINDEX|IFNAME>_IN <IFINDEX|IFNAME>_OUT\n";
+
 #include <linux/bpf.h>
 #include <linux/if_link.h>
 #include <assert.h>
 #include <net/if.h>
 #include <unistd.h>
 #include <libgen.h>
-#include <sys/resource.h>
-#include <sys/ioctl.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-
-#include "bpf_util.h"
+#include <getopt.h>
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
+#include "bpf_util.h"
+#include "xdp_sample_user.h"
+#include "xdp_redirect_map.skel.h"
 
-static int ifindex_in;
-static int ifindex_out;
-static bool ifindex_out_xdp_dummy_attached = true;
-static bool xdp_devmap_attached;
-static __u32 prog_id;
-static __u32 dummy_prog_id;
-
-static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
-static int rxcnt_map_fd;
-
-static void int_exit(int sig)
-{
-       __u32 curr_prog_id = 0;
-
-       if (bpf_get_link_xdp_id(ifindex_in, &curr_prog_id, xdp_flags)) {
-               printf("bpf_get_link_xdp_id failed\n");
-               exit(1);
-       }
-       if (prog_id == curr_prog_id)
-               bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags);
-       else if (!curr_prog_id)
-               printf("couldn't find a prog id on iface IN\n");
-       else
-               printf("program on iface IN changed, not removing\n");
-
-       if (ifindex_out_xdp_dummy_attached) {
-               curr_prog_id = 0;
-               if (bpf_get_link_xdp_id(ifindex_out, &curr_prog_id,
-                                       xdp_flags)) {
-                       printf("bpf_get_link_xdp_id failed\n");
-                       exit(1);
-               }
-               if (dummy_prog_id == curr_prog_id)
-                       bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags);
-               else if (!curr_prog_id)
-                       printf("couldn't find a prog id on iface OUT\n");
-               else
-                       printf("program on iface OUT changed, not removing\n");
-       }
-       exit(0);
-}
-
-static void poll_stats(int interval, int ifindex)
-{
-       unsigned int nr_cpus = bpf_num_possible_cpus();
-       __u64 values[nr_cpus], prev[nr_cpus];
-
-       memset(prev, 0, sizeof(prev));
-
-       while (1) {
-               __u64 sum = 0;
-               __u32 key = 0;
-               int i;
-
-               sleep(interval);
-               assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0);
-               for (i = 0; i < nr_cpus; i++)
-                       sum += (values[i] - prev[i]);
-               if (sum)
-                       printf("ifindex %i: %10llu pkt/s\n",
-                              ifindex, sum / interval);
-               memcpy(prev, values, sizeof(values));
-       }
-}
-
-static int get_mac_addr(unsigned int ifindex_out, void *mac_addr)
-{
-       char ifname[IF_NAMESIZE];
-       struct ifreq ifr;
-       int fd, ret = -1;
-
-       fd = socket(AF_INET, SOCK_DGRAM, 0);
-       if (fd < 0)
-               return ret;
-
-       if (!if_indextoname(ifindex_out, ifname))
-               goto err_out;
-
-       strcpy(ifr.ifr_name, ifname);
-
-       if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
-               goto err_out;
-
-       memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char));
-       ret = 0;
+static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_MAP_CNT |
+                 SAMPLE_EXCEPTION_CNT | SAMPLE_DEVMAP_XMIT_CNT_MULTI;
 
-err_out:
-       close(fd);
-       return ret;
-}
+DEFINE_SAMPLE_INIT(xdp_redirect_map);
 
-static void usage(const char *prog)
-{
-       fprintf(stderr,
-               "usage: %s [OPTS] <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n\n"
-               "OPTS:\n"
-               "    -S    use skb-mode\n"
-               "    -N    enforce native mode\n"
-               "    -F    force loading prog\n"
-               "    -X    load xdp program on egress\n",
-               prog);
-}
+static const struct option long_options[] = {
+       { "help", no_argument, NULL, 'h' },
+       { "skb-mode", no_argument, NULL, 'S' },
+       { "force", no_argument, NULL, 'F' },
+       { "load-egress", no_argument, NULL, 'X' },
+       { "stats", no_argument, NULL, 's' },
+       { "interval", required_argument, NULL, 'i' },
+       { "verbose", no_argument, NULL, 'v' },
+       {}
+};
 
 int main(int argc, char **argv)
 {
-       struct bpf_prog_load_attr prog_load_attr = {
-               .prog_type      = BPF_PROG_TYPE_UNSPEC,
-       };
-       struct bpf_program *prog, *dummy_prog, *devmap_prog;
-       int prog_fd, dummy_prog_fd, devmap_prog_fd = 0;
-       int tx_port_map_fd, tx_mac_map_fd;
-       struct bpf_devmap_val devmap_val;
-       struct bpf_prog_info info = {};
-       __u32 info_len = sizeof(info);
-       const char *optstr = "FSNX";
-       struct bpf_object *obj;
-       int ret, opt, key = 0;
-       char filename[256];
-
-       while ((opt = getopt(argc, argv, optstr)) != -1) {
+       struct bpf_devmap_val devmap_val = {};
+       bool xdp_devmap_attached = false;
+       struct xdp_redirect_map *skel;
+       char str[2 * IF_NAMESIZE + 1];
+       char ifname_out[IF_NAMESIZE];
+       struct bpf_map *tx_port_map;
+       char ifname_in[IF_NAMESIZE];
+       int ifindex_in, ifindex_out;
+       unsigned long interval = 2;
+       int ret = EXIT_FAIL_OPTION;
+       struct bpf_program *prog;
+       bool generic = false;
+       bool force = false;
+       bool tried = false;
+       bool error = true;
+       int opt, key = 0;
+
+       while ((opt = getopt_long(argc, argv, "hSFXi:vs",
+                                 long_options, NULL)) != -1) {
                switch (opt) {
                case 'S':
-                       xdp_flags |= XDP_FLAGS_SKB_MODE;
-                       break;
-               case 'N':
-                       /* default, set below */
+                       generic = true;
+                       /* devmap_xmit tracepoint not available */
+                       mask &= ~(SAMPLE_DEVMAP_XMIT_CNT |
+                                 SAMPLE_DEVMAP_XMIT_CNT_MULTI);
                        break;
                case 'F':
-                       xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
+                       force = true;
                        break;
                case 'X':
                        xdp_devmap_attached = true;
                        break;
+               case 'i':
+                       interval = strtoul(optarg, NULL, 0);
+                       break;
+               case 'v':
+                       sample_switch_mode();
+                       break;
+               case 's':
+                       mask |= SAMPLE_REDIRECT_MAP_CNT;
+                       break;
+               case 'h':
+                       error = false;
                default:
-                       usage(basename(argv[0]));
-                       return 1;
+                       sample_usage(argv, long_options, __doc__, mask, error);
+                       return ret;
                }
        }
 
-       if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) {
-               xdp_flags |= XDP_FLAGS_DRV_MODE;
-       } else if (xdp_devmap_attached) {
-               printf("Load xdp program on egress with SKB mode not supported yet\n");
-               return 1;
-       }
-
-       if (optind == argc) {
-               printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]);
-               return 1;
+       if (argc <= optind + 1) {
+               sample_usage(argv, long_options, __doc__, mask, true);
+               goto end;
        }
 
        ifindex_in = if_nametoindex(argv[optind]);
@@ -182,107 +104,116 @@ int main(int argc, char **argv)
        if (!ifindex_out)
                ifindex_out = strtoul(argv[optind + 1], NULL, 0);
 
-       printf("input: %d output: %d\n", ifindex_in, ifindex_out);
-
-       snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-       prog_load_attr.file = filename;
-
-       if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
-               return 1;
-
-       if (xdp_flags & XDP_FLAGS_SKB_MODE) {
-               prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_general");
-               tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port_general");
-       } else {
-               prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_native");
-               tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port_native");
-       }
-       dummy_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_dummy_prog");
-       if (!prog || dummy_prog < 0 || tx_port_map_fd < 0) {
-               printf("finding prog/dummy_prog/tx_port_map in obj file failed\n");
-               goto out;
-       }
-       prog_fd = bpf_program__fd(prog);
-       dummy_prog_fd = bpf_program__fd(dummy_prog);
-       if (prog_fd < 0 || dummy_prog_fd < 0 || tx_port_map_fd < 0) {
-               printf("bpf_prog_load_xattr: %s\n", strerror(errno));
-               return 1;
-       }
-
-       tx_mac_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_mac");
-       rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt");
-       if (tx_mac_map_fd < 0 || rxcnt_map_fd < 0) {
-               printf("bpf_object__find_map_fd_by_name failed\n");
-               return 1;
+       if (!ifindex_in || !ifindex_out) {
+               fprintf(stderr, "Bad interface index or name\n");
+               sample_usage(argv, long_options, __doc__, mask, true);
+               goto end;
        }
 
-       if (bpf_set_link_xdp_fd(ifindex_in, prog_fd, xdp_flags) < 0) {
-               printf("ERROR: link set xdp fd failed on %d\n", ifindex_in);
-               return 1;
+       skel = xdp_redirect_map__open();
+       if (!skel) {
+               fprintf(stderr, "Failed to xdp_redirect_map__open: %s\n",
+                       strerror(errno));
+               ret = EXIT_FAIL_BPF;
+               goto end;
        }
 
-       ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
-       if (ret) {
-               printf("can't get prog info - %s\n", strerror(errno));
-               return ret;
+       ret = sample_init_pre_load(skel);
+       if (ret < 0) {
+               fprintf(stderr, "Failed to sample_init_pre_load: %s\n", strerror(-ret));
+               ret = EXIT_FAIL_BPF;
+               goto end_destroy;
        }
-       prog_id = info.id;
-
-       /* Loading dummy XDP prog on out-device */
-       if (bpf_set_link_xdp_fd(ifindex_out, dummy_prog_fd,
-                           (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) {
-               printf("WARN: link set xdp fd failed on %d\n", ifindex_out);
-               ifindex_out_xdp_dummy_attached = false;
-       }
-
-       memset(&info, 0, sizeof(info));
-       ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len);
-       if (ret) {
-               printf("can't get prog info - %s\n", strerror(errno));
-               return ret;
-       }
-       dummy_prog_id = info.id;
 
        /* Load 2nd xdp prog on egress. */
        if (xdp_devmap_attached) {
-               unsigned char mac_addr[6];
-
-               devmap_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_egress");
-               if (!devmap_prog) {
-                       printf("finding devmap_prog in obj file failed\n");
-                       goto out;
-               }
-               devmap_prog_fd = bpf_program__fd(devmap_prog);
-               if (devmap_prog_fd < 0) {
-                       printf("finding devmap_prog fd failed\n");
-                       goto out;
-               }
-
-               if (get_mac_addr(ifindex_out, mac_addr) < 0) {
-                       printf("get interface %d mac failed\n", ifindex_out);
-                       goto out;
+               ret = get_mac_addr(ifindex_out, skel->rodata->tx_mac_addr);
+               if (ret < 0) {
+                       fprintf(stderr, "Failed to get interface %d mac address: %s\n",
+                               ifindex_out, strerror(-ret));
+                       ret = EXIT_FAIL;
+                       goto end_destroy;
                }
+       }
 
-               ret = bpf_map_update_elem(tx_mac_map_fd, &key, mac_addr, 0);
-               if (ret) {
-                       perror("bpf_update_elem tx_mac_map_fd");
-                       goto out;
+       skel->rodata->from_match[0] = ifindex_in;
+       skel->rodata->to_match[0] = ifindex_out;
+
+       ret = xdp_redirect_map__load(skel);
+       if (ret < 0) {
+               fprintf(stderr, "Failed to xdp_redirect_map__load: %s\n",
+                       strerror(errno));
+               ret = EXIT_FAIL_BPF;
+               goto end_destroy;
+       }
+
+       ret = sample_init(skel, mask);
+       if (ret < 0) {
+               fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret));
+               ret = EXIT_FAIL;
+               goto end_destroy;
+       }
+
+       prog = skel->progs.xdp_redirect_map_native;
+       tx_port_map = skel->maps.tx_port_native;
+restart:
+       if (sample_install_xdp(prog, ifindex_in, generic, force) < 0) {
+               /* First try with struct bpf_devmap_val as value for generic
+                * mode, then fallback to sizeof(int) for older kernels.
+                */
+               if (generic && !tried) {
+                       fprintf(stderr,
+                               "Trying fallback to sizeof(int) as value_size for devmap in generic mode\n");
+                       prog = skel->progs.xdp_redirect_map_general;
+                       tx_port_map = skel->maps.tx_port_general;
+                       tried = true;
+                       goto restart;
                }
+               ret = EXIT_FAIL_XDP;
+               goto end_destroy;
        }
 
-       signal(SIGINT, int_exit);
-       signal(SIGTERM, int_exit);
+       /* Loading dummy XDP prog on out-device */
+       sample_install_xdp(skel->progs.xdp_redirect_dummy_prog, ifindex_out, generic, force);
 
        devmap_val.ifindex = ifindex_out;
-       devmap_val.bpf_prog.fd = devmap_prog_fd;
-       ret = bpf_map_update_elem(tx_port_map_fd, &key, &devmap_val, 0);
-       if (ret) {
-               perror("bpf_update_elem");
-               goto out;
-       }
-
-       poll_stats(2, ifindex_out);
-
-out:
-       return 0;
+       if (xdp_devmap_attached)
+               devmap_val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_redirect_map_egress);
+       ret = bpf_map_update_elem(bpf_map__fd(tx_port_map), &key, &devmap_val, 0);
+       if (ret < 0) {
+               fprintf(stderr, "Failed to update devmap value: %s\n",
+                       strerror(errno));
+               ret = EXIT_FAIL_BPF;
+               goto end_destroy;
+       }
+
+       ret = EXIT_FAIL;
+       if (!if_indextoname(ifindex_in, ifname_in)) {
+               fprintf(stderr, "Failed to if_indextoname for %d: %s\n", ifindex_in,
+                       strerror(errno));
+               goto end_destroy;
+       }
+
+       if (!if_indextoname(ifindex_out, ifname_out)) {
+               fprintf(stderr, "Failed to if_indextoname for %d: %s\n", ifindex_out,
+                       strerror(errno));
+               goto end_destroy;
+       }
+
+       safe_strncpy(str, get_driver_name(ifindex_in), sizeof(str));
+       printf("Redirecting from %s (ifindex %d; driver %s) to %s (ifindex %d; driver %s)\n",
+              ifname_in, ifindex_in, str, ifname_out, ifindex_out, get_driver_name(ifindex_out));
+       snprintf(str, sizeof(str), "%s->%s", ifname_in, ifname_out);
+
+       ret = sample_run(interval, NULL, NULL);
+       if (ret < 0) {
+               fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret));
+               ret = EXIT_FAIL;
+               goto end_destroy;
+       }
+       ret = EXIT_OK;
+end_destroy:
+       xdp_redirect_map__destroy(skel);
+end:
+       sample_exit(ret);
 }
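
Note: the restart/fallback above exists because older kernels accept only
4-byte (ifindex-only) devmap values in generic mode. The BPF object therefore
presumably carries two flavours of the redirect map, roughly as follows (the
definitions are assumed; they are not part of this view):

struct {
        __uint(type, BPF_MAP_TYPE_DEVMAP);
        __uint(key_size, sizeof(int));
        __uint(value_size, sizeof(int));   /* legacy 4-byte value */
        __uint(max_entries, 1);
} tx_port_general SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_DEVMAP);
        __uint(key_size, sizeof(int));
        __uint(value_size, sizeof(struct bpf_devmap_val));
        __uint(max_entries, 1);
} tx_port_native SEC(".maps");
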
index 93854e1..7af5b07 100644
@@ -1,6 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2016 John Fastabend <john.r.fastabend@intel.com>
  */
+static const char *__doc__ =
+"XDP redirect tool, using bpf_redirect helper\n"
+"Usage: xdp_redirect <IFINDEX|IFNAME>_IN <IFINDEX|IFNAME>_OUT\n";
+
 #include <linux/bpf.h>
 #include <linux/if_link.h>
 #include <assert.h>
 #include <net/if.h>
 #include <unistd.h>
 #include <libgen.h>
+#include <getopt.h>
 #include <sys/resource.h>
-
-#include "bpf_util.h"
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
+#include "bpf_util.h"
+#include "xdp_sample_user.h"
+#include "xdp_redirect.skel.h"
 
-static int ifindex_in;
-static int ifindex_out;
-static bool ifindex_out_xdp_dummy_attached = true;
-static __u32 prog_id;
-static __u32 dummy_prog_id;
-
-static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
-static int rxcnt_map_fd;
-
-static void int_exit(int sig)
-{
-       __u32 curr_prog_id = 0;
-
-       if (bpf_get_link_xdp_id(ifindex_in, &curr_prog_id, xdp_flags)) {
-               printf("bpf_get_link_xdp_id failed\n");
-               exit(1);
-       }
-       if (prog_id == curr_prog_id)
-               bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags);
-       else if (!curr_prog_id)
-               printf("couldn't find a prog id on iface IN\n");
-       else
-               printf("program on iface IN changed, not removing\n");
-
-       if (ifindex_out_xdp_dummy_attached) {
-               curr_prog_id = 0;
-               if (bpf_get_link_xdp_id(ifindex_out, &curr_prog_id,
-                                       xdp_flags)) {
-                       printf("bpf_get_link_xdp_id failed\n");
-                       exit(1);
-               }
-               if (dummy_prog_id == curr_prog_id)
-                       bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags);
-               else if (!curr_prog_id)
-                       printf("couldn't find a prog id on iface OUT\n");
-               else
-                       printf("program on iface OUT changed, not removing\n");
-       }
-       exit(0);
-}
-
-static void poll_stats(int interval, int ifindex)
-{
-       unsigned int nr_cpus = bpf_num_possible_cpus();
-       __u64 values[nr_cpus], prev[nr_cpus];
-
-       memset(prev, 0, sizeof(prev));
-
-       while (1) {
-               __u64 sum = 0;
-               __u32 key = 0;
-               int i;
-
-               sleep(interval);
-               assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0);
-               for (i = 0; i < nr_cpus; i++)
-                       sum += (values[i] - prev[i]);
-               if (sum)
-                       printf("ifindex %i: %10llu pkt/s\n",
-                              ifindex, sum / interval);
-               memcpy(prev, values, sizeof(values));
-       }
-}
+static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_CNT |
+                 SAMPLE_EXCEPTION_CNT | SAMPLE_DEVMAP_XMIT_CNT_MULTI;
 
-static void usage(const char *prog)
-{
-       fprintf(stderr,
-               "usage: %s [OPTS] <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n\n"
-               "OPTS:\n"
-               "    -S    use skb-mode\n"
-               "    -N    enforce native mode\n"
-               "    -F    force loading prog\n",
-               prog);
-}
+DEFINE_SAMPLE_INIT(xdp_redirect);
 
+static const struct option long_options[] = {
+       {"help",        no_argument,            NULL, 'h' },
+       {"skb-mode",    no_argument,            NULL, 'S' },
+       {"force",       no_argument,            NULL, 'F' },
+       {"stats",       no_argument,            NULL, 's' },
+       {"interval",    required_argument,      NULL, 'i' },
+       {"verbose",     no_argument,            NULL, 'v' },
+       {}
+};
 
 int main(int argc, char **argv)
 {
-       struct bpf_prog_load_attr prog_load_attr = {
-               .prog_type      = BPF_PROG_TYPE_XDP,
-       };
-       struct bpf_program *prog, *dummy_prog;
-       int prog_fd, tx_port_map_fd, opt;
-       struct bpf_prog_info info = {};
-       __u32 info_len = sizeof(info);
-       const char *optstr = "FSN";
-       struct bpf_object *obj;
-       char filename[256];
-       int dummy_prog_fd;
-       int ret, key = 0;
-
-       while ((opt = getopt(argc, argv, optstr)) != -1) {
+       int ifindex_in, ifindex_out, opt;
+       char str[2 * IF_NAMESIZE + 1];
+       char ifname_out[IF_NAMESIZE];
+       char ifname_in[IF_NAMESIZE];
+       int ret = EXIT_FAIL_OPTION;
+       unsigned long interval = 2;
+       struct xdp_redirect *skel;
+       bool generic = false;
+       bool force = false;
+       bool error = true;
+
+       while ((opt = getopt_long(argc, argv, "hSFi:vs",
+                                 long_options, NULL)) != -1) {
                switch (opt) {
                case 'S':
-                       xdp_flags |= XDP_FLAGS_SKB_MODE;
-                       break;
-               case 'N':
-                       /* default, set below */
+                       generic = true;
+                       mask &= ~(SAMPLE_DEVMAP_XMIT_CNT |
+                                 SAMPLE_DEVMAP_XMIT_CNT_MULTI);
                        break;
                case 'F':
-                       xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
+                       force = true;
+                       break;
+               case 'i':
+                       interval = strtoul(optarg, NULL, 0);
+                       break;
+               case 'v':
+                       sample_switch_mode();
+                       break;
+               case 's':
+                       mask |= SAMPLE_REDIRECT_CNT;
                        break;
+               case 'h':
+                       error = false;
                default:
-                       usage(basename(argv[0]));
-                       return 1;
+                       sample_usage(argv, long_options, __doc__, mask, error);
+                       return ret;
                }
        }
 
-       if (!(xdp_flags & XDP_FLAGS_SKB_MODE))
-               xdp_flags |= XDP_FLAGS_DRV_MODE;
-
-       if (optind + 2 != argc) {
-               printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]);
-               return 1;
+       if (argc <= optind + 1) {
+               sample_usage(argv, long_options, __doc__, mask, true);
+               return ret;
        }
 
        ifindex_in = if_nametoindex(argv[optind]);
@@ -143,75 +94,80 @@ int main(int argc, char **argv)
        if (!ifindex_out)
                ifindex_out = strtoul(argv[optind + 1], NULL, 0);
 
-       printf("input: %d output: %d\n", ifindex_in, ifindex_out);
-
-       snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-       prog_load_attr.file = filename;
-
-       if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
-               return 1;
-
-       prog = bpf_program__next(NULL, obj);
-       dummy_prog = bpf_program__next(prog, obj);
-       if (!prog || !dummy_prog) {
-               printf("finding a prog in obj file failed\n");
-               return 1;
+       if (!ifindex_in || !ifindex_out) {
+               fprintf(stderr, "Bad interface index or name\n");
+               sample_usage(argv, long_options, __doc__, mask, true);
+               goto end;
        }
-       /* bpf_prog_load_xattr gives us the pointer to first prog's fd,
-        * so we're missing only the fd for dummy prog
-        */
-       dummy_prog_fd = bpf_program__fd(dummy_prog);
-       if (prog_fd < 0 || dummy_prog_fd < 0) {
-               printf("bpf_prog_load_xattr: %s\n", strerror(errno));
-               return 1;
+
+       skel = xdp_redirect__open();
+       if (!skel) {
+               fprintf(stderr, "Failed to xdp_redirect__open: %s\n", strerror(errno));
+               ret = EXIT_FAIL_BPF;
+               goto end;
        }
 
-       tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port");
-       rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt");
-       if (tx_port_map_fd < 0 || rxcnt_map_fd < 0) {
-               printf("bpf_object__find_map_fd_by_name failed\n");
-               return 1;
+       ret = sample_init_pre_load(skel);
+       if (ret < 0) {
+               fprintf(stderr, "Failed to sample_init_pre_load: %s\n", strerror(-ret));
+               ret = EXIT_FAIL_BPF;
+               goto end_destroy;
        }
 
-       if (bpf_set_link_xdp_fd(ifindex_in, prog_fd, xdp_flags) < 0) {
-               printf("ERROR: link set xdp fd failed on %d\n", ifindex_in);
-               return 1;
+       skel->rodata->from_match[0] = ifindex_in;
+       skel->rodata->to_match[0] = ifindex_out;
+       skel->rodata->ifindex_out = ifindex_out;
+
+       ret = xdp_redirect__load(skel);
+       if (ret < 0) {
+               fprintf(stderr, "Failed to xdp_redirect__load: %s\n", strerror(errno));
+               ret = EXIT_FAIL_BPF;
+               goto end_destroy;
        }
 
-       ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
-       if (ret) {
-               printf("can't get prog info - %s\n", strerror(errno));
-               return ret;
+       ret = sample_init(skel, mask);
+       if (ret < 0) {
+               fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret));
+               ret = EXIT_FAIL;
+               goto end_destroy;
        }
-       prog_id = info.id;
+
+       ret = EXIT_FAIL_XDP;
+       if (sample_install_xdp(skel->progs.xdp_redirect_prog, ifindex_in,
+                              generic, force) < 0)
+               goto end_destroy;
 
        /* Loading dummy XDP prog on out-device */
-       if (bpf_set_link_xdp_fd(ifindex_out, dummy_prog_fd,
-                           (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) {
-               printf("WARN: link set xdp fd failed on %d\n", ifindex_out);
-               ifindex_out_xdp_dummy_attached = false;
+       sample_install_xdp(skel->progs.xdp_redirect_dummy_prog, ifindex_out,
+                          generic, force);
+
+       ret = EXIT_FAIL;
+       if (!if_indextoname(ifindex_in, ifname_in)) {
+               fprintf(stderr, "Failed to if_indextoname for %d: %s\n", ifindex_in,
+                       strerror(errno));
+               goto end_destroy;
        }
 
-       memset(&info, 0, sizeof(info));
-       ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len);
-       if (ret) {
-               printf("can't get prog info - %s\n", strerror(errno));
-               return ret;
+       if (!if_indextoname(ifindex_out, ifname_out)) {
+               fprintf(stderr, "Failed to if_indextoname for %d: %s\n", ifindex_out,
+                       strerror(errno));
+               goto end_destroy;
        }
-       dummy_prog_id = info.id;
 
-       signal(SIGINT, int_exit);
-       signal(SIGTERM, int_exit);
+       safe_strncpy(str, get_driver_name(ifindex_in), sizeof(str));
+       printf("Redirecting from %s (ifindex %d; driver %s) to %s (ifindex %d; driver %s)\n",
+              ifname_in, ifindex_in, str, ifname_out, ifindex_out, get_driver_name(ifindex_out));
+       snprintf(str, sizeof(str), "%s->%s", ifname_in, ifname_out);
 
-       /* bpf redirect port */
-       ret = bpf_map_update_elem(tx_port_map_fd, &key, &ifindex_out, 0);
-       if (ret) {
-               perror("bpf_update_elem");
-               goto out;
+       ret = sample_run(interval, NULL, NULL);
+       if (ret < 0) {
+               fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret));
+               ret = EXIT_FAIL;
+               goto end_destroy;
        }
-
-       poll_stats(2, ifindex_out);
-
-out:
-       return ret;
+       ret = EXIT_OK;
+end_destroy:
+       xdp_redirect__destroy(skel);
+end:
+       sample_exit(ret);
 }
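
Note: unlike the map-based samples, plain xdp_redirect needs no map on the fast
path; the target ifindex is baked into .rodata (skel->rodata->ifindex_out
above), so the program sees it as a constant. A minimal sketch of the BPF side;
the real program presumably also rewrites MACs and bumps rx_cnt:

const volatile int ifindex_out;

SEC("xdp")
int xdp_redirect_prog(struct xdp_md *ctx)
{
        /* counter update and MAC rewrite elided */
        return bpf_redirect(ifindex_out, 0);
}
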
diff --git a/samples/bpf/xdp_sample.bpf.c b/samples/bpf/xdp_sample.bpf.c
new file mode 100644
index 0000000..0eb7e1d
--- /dev/null
@@ -0,0 +1,266 @@
+// SPDX-License-Identifier: GPL-2.0
+/*  GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. */
+#include "xdp_sample.bpf.h"
+
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+
+array_map rx_cnt SEC(".maps");
+array_map redir_err_cnt SEC(".maps");
+array_map cpumap_enqueue_cnt SEC(".maps");
+array_map cpumap_kthread_cnt SEC(".maps");
+array_map exception_cnt SEC(".maps");
+array_map devmap_xmit_cnt SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+       __uint(max_entries, 32 * 32);
+       __type(key, u64);
+       __type(value, struct datarec);
+} devmap_xmit_cnt_multi SEC(".maps");
+
+const volatile int nr_cpus = 0;
+
+/* These can be set before loading so that redundant comparisons can be DCE'd by
+ * the verifier, and only actual matches are tried after loading the tp_btf program.
+ * This allows the sample to filter tracepoint stats based on net_device.
+ */
+const volatile int from_match[32] = {};
+const volatile int to_match[32] = {};
+
+int cpumap_map_id = 0;
+
+/* Find whether b is a member of set a; if a is the empty set, evaluate to true */
+#define IN_SET(a, b)                                                 \
+       ({                                                           \
+               bool __res = !(a)[0];                                \
+               for (int i = 0; i < ARRAY_SIZE(a) && (a)[i]; i++) { \
+                       __res = (a)[i] == (b);                       \
+                       if (__res)                                   \
+                               break;                               \
+               }                                                    \
+               __res;                                               \
+       })
+
+static __always_inline __u32 xdp_get_err_key(int err)
+{
+       switch (err) {
+       case 0:
+               return 0;
+       case -EINVAL:
+               return 2;
+       case -ENETDOWN:
+               return 3;
+       case -EMSGSIZE:
+               return 4;
+       case -EOPNOTSUPP:
+               return 5;
+       case -ENOSPC:
+               return 6;
+       default:
+               return 1;
+       }
+}
+
+static __always_inline int xdp_redirect_collect_stat(int from, int err)
+{
+       u32 cpu = bpf_get_smp_processor_id();
+       u32 key = XDP_REDIRECT_ERROR;
+       struct datarec *rec;
+       u32 idx;
+
+       if (!IN_SET(from_match, from))
+               return 0;
+
+       key = xdp_get_err_key(err);
+
+       idx = key * nr_cpus + cpu;
+       rec = bpf_map_lookup_elem(&redir_err_cnt, &idx);
+       if (!rec)
+               return 0;
+       if (key)
+               NO_TEAR_INC(rec->dropped);
+       else
+               NO_TEAR_INC(rec->processed);
+       return 0; /* Indicate event was filtered (no further processing) */
+       /*
+        * Returning 1 here would allow e.g. a perf-record tracepoint
+        * to see and record these events, but that doesn't work well
+        * in practice, as stopping perf-record would also unload this
+        * bpf_prog.  Plus, there is the additional overhead of doing so.
+        */
+}
+
+SEC("tp_btf/xdp_redirect_err")
+int BPF_PROG(tp_xdp_redirect_err, const struct net_device *dev,
+            const struct bpf_prog *xdp, const void *tgt, int err,
+            const struct bpf_map *map, u32 index)
+{
+       return xdp_redirect_collect_stat(dev->ifindex, err);
+}
+
+SEC("tp_btf/xdp_redirect_map_err")
+int BPF_PROG(tp_xdp_redirect_map_err, const struct net_device *dev,
+            const struct bpf_prog *xdp, const void *tgt, int err,
+            const struct bpf_map *map, u32 index)
+{
+       return xdp_redirect_collect_stat(dev->ifindex, err);
+}
+
+SEC("tp_btf/xdp_redirect")
+int BPF_PROG(tp_xdp_redirect, const struct net_device *dev,
+            const struct bpf_prog *xdp, const void *tgt, int err,
+            const struct bpf_map *map, u32 index)
+{
+       return xdp_redirect_collect_stat(dev->ifindex, err);
+}
+
+SEC("tp_btf/xdp_redirect_map")
+int BPF_PROG(tp_xdp_redirect_map, const struct net_device *dev,
+            const struct bpf_prog *xdp, const void *tgt, int err,
+            const struct bpf_map *map, u32 index)
+{
+       return xdp_redirect_collect_stat(dev->ifindex, err);
+}
+
+SEC("tp_btf/xdp_cpumap_enqueue")
+int BPF_PROG(tp_xdp_cpumap_enqueue, int map_id, unsigned int processed,
+            unsigned int drops, int to_cpu)
+{
+       u32 cpu = bpf_get_smp_processor_id();
+       struct datarec *rec;
+       u32 idx;
+
+       if (cpumap_map_id && cpumap_map_id != map_id)
+               return 0;
+
+       idx = to_cpu * nr_cpus + cpu;
+       rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &idx);
+       if (!rec)
+               return 0;
+       NO_TEAR_ADD(rec->processed, processed);
+       NO_TEAR_ADD(rec->dropped, drops);
+       /* Record bulk events, then userspace can calc average bulk size */
+       if (processed > 0)
+               NO_TEAR_INC(rec->issue);
+       /* Inception: it's possible to detect overload situations via
+        * this tracepoint.  This can be used for creating a feedback
+        * loop to XDP, which can take appropriate actions to mitigate
+        * the overload situation.
+        */
+       return 0;
+}
+
+SEC("tp_btf/xdp_cpumap_kthread")
+int BPF_PROG(tp_xdp_cpumap_kthread, int map_id, unsigned int processed,
+            unsigned int drops, int sched, struct xdp_cpumap_stats *xdp_stats)
+{
+       struct datarec *rec;
+       u32 cpu;
+
+       if (cpumap_map_id && cpumap_map_id != map_id)
+               return 0;
+
+       cpu = bpf_get_smp_processor_id();
+       rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &cpu);
+       if (!rec)
+               return 0;
+       NO_TEAR_ADD(rec->processed, processed);
+       NO_TEAR_ADD(rec->dropped, drops);
+       NO_TEAR_ADD(rec->xdp_pass, xdp_stats->pass);
+       NO_TEAR_ADD(rec->xdp_drop, xdp_stats->drop);
+       NO_TEAR_ADD(rec->xdp_redirect, xdp_stats->redirect);
+       /* Count times kthread yielded CPU via schedule call */
+       if (sched)
+               NO_TEAR_INC(rec->issue);
+       return 0;
+}
+
+SEC("tp_btf/xdp_exception")
+int BPF_PROG(tp_xdp_exception, const struct net_device *dev,
+            const struct bpf_prog *xdp, u32 act)
+{
+       u32 cpu = bpf_get_smp_processor_id();
+       struct datarec *rec;
+       u32 key = act, idx;
+
+       if (!IN_SET(from_match, dev->ifindex))
+               return 0;
+       if (!IN_SET(to_match, dev->ifindex))
+               return 0;
+
+       if (key > XDP_REDIRECT)
+               key = XDP_REDIRECT + 1;
+
+       idx = key * nr_cpus + cpu;
+       rec = bpf_map_lookup_elem(&exception_cnt, &idx);
+       if (!rec)
+               return 0;
+       NO_TEAR_INC(rec->dropped);
+
+       return 0;
+}
+
+SEC("tp_btf/xdp_devmap_xmit")
+int BPF_PROG(tp_xdp_devmap_xmit, const struct net_device *from_dev,
+            const struct net_device *to_dev, int sent, int drops, int err)
+{
+       struct datarec *rec;
+       int idx_in, idx_out;
+       u32 cpu;
+
+       idx_in = from_dev->ifindex;
+       idx_out = to_dev->ifindex;
+
+       if (!IN_SET(from_match, idx_in))
+               return 0;
+       if (!IN_SET(to_match, idx_out))
+               return 0;
+
+       cpu = bpf_get_smp_processor_id();
+       rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &cpu);
+       if (!rec)
+               return 0;
+       NO_TEAR_ADD(rec->processed, sent);
+       NO_TEAR_ADD(rec->dropped, drops);
+       /* Record bulk events, then userspace can calc average bulk size */
+       NO_TEAR_INC(rec->info);
+       /* Record error cases, where no frames were sent */
+       /* Catch the API error where drv ndo_xdp_xmit sent more than count */
+       if (err || drops < 0)
+               NO_TEAR_INC(rec->issue);
+       return 0;
+}
+
+SEC("tp_btf/xdp_devmap_xmit")
+int BPF_PROG(tp_xdp_devmap_xmit_multi, const struct net_device *from_dev,
+            const struct net_device *to_dev, int sent, int drops, int err)
+{
+       struct datarec empty = {};
+       struct datarec *rec;
+       int idx_in, idx_out;
+       u64 idx;
+
+       idx_in = from_dev->ifindex;
+       idx_out = to_dev->ifindex;
+       idx = idx_in;
+       idx = idx << 32 | idx_out;
+
+       if (!IN_SET(from_match, idx_in))
+               return 0;
+       if (!IN_SET(to_match, idx_out))
+               return 0;
+
+       bpf_map_update_elem(&devmap_xmit_cnt_multi, &idx, &empty, BPF_NOEXIST);
+       rec = bpf_map_lookup_elem(&devmap_xmit_cnt_multi, &idx);
+       if (!rec)
+               return 0;
+
+       NO_TEAR_ADD(rec->processed, sent);
+       NO_TEAR_ADD(rec->dropped, drops);
+       NO_TEAR_INC(rec->info);
+       if (err || drops < 0)
+               NO_TEAR_INC(rec->issue);
+       return 0;
+}
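
Note on the counter layout: every collector above indexes a flat BPF_F_MMAPABLE
array as key * nr_cpus + cpu, so userspace can mmap() the map once and sum a
row without a syscall per read. A hypothetical reader, with names assumed (the
real consumer lives in xdp_sample_user.c below):

static __u64 sum_processed(const struct datarec *base, int key, int nr_cpus)
{
        __u64 sum = 0;
        int cpu;

        /* one slot per (counter, cpu) pair; writers never share a slot */
        for (cpu = 0; cpu < nr_cpus; cpu++)
                sum += base[key * nr_cpus + cpu].processed;
        return sum;
}
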
diff --git a/samples/bpf/xdp_sample.bpf.h b/samples/bpf/xdp_sample.bpf.h
new file mode 100644
index 0000000..25b1dbe
--- /dev/null
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _XDP_SAMPLE_BPF_H
+#define _XDP_SAMPLE_BPF_H
+
+#include "vmlinux.h"
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+
+#include "xdp_sample_shared.h"
+
+#define ETH_ALEN 6
+#define ETH_P_802_3_MIN 0x0600
+#define ETH_P_8021Q 0x8100
+#define ETH_P_8021AD 0x88A8
+#define ETH_P_IP 0x0800
+#define ETH_P_IPV6 0x86DD
+#define ETH_P_ARP 0x0806
+#define IPPROTO_ICMPV6 58
+
+#define EINVAL 22
+#define ENETDOWN 100
+#define EMSGSIZE 90
+#define EOPNOTSUPP 95
+#define ENOSPC 28
+
+typedef struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __uint(map_flags, BPF_F_MMAPABLE);
+       __type(key, unsigned int);
+       __type(value, struct datarec);
+} array_map;
+
+extern array_map rx_cnt;
+extern const volatile int nr_cpus;
+
+enum {
+       XDP_REDIRECT_SUCCESS = 0,
+       XDP_REDIRECT_ERROR = 1
+};
+
+static __always_inline void swap_src_dst_mac(void *data)
+{
+       unsigned short *p = data;
+       unsigned short dst[3];
+
+       dst[0] = p[0];
+       dst[1] = p[1];
+       dst[2] = p[2];
+       p[0] = p[3];
+       p[1] = p[4];
+       p[2] = p[5];
+       p[3] = dst[0];
+       p[4] = dst[1];
+       p[5] = dst[2];
+}
+
+#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
+       __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define bpf_ntohs(x)           __builtin_bswap16(x)
+#define bpf_htons(x)           __builtin_bswap16(x)
+#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
+       __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define bpf_ntohs(x)           (x)
+#define bpf_htons(x)           (x)
+#else
+# error "Endianness detection needs to be set up for your compiler?!"
+#endif
+
+/*
+ * Note: including linux/compiler.h or linux/kernel.h for the macros below
+ * conflicts with vmlinux.h include in BPF files, so we define them here.
+ *
+ * Following functions are taken from kernel sources and
+ * break aliasing rules in their original form.
+ *
+ * While kernel is compiled with -fno-strict-aliasing,
+ * perf uses -Wstrict-aliasing=3 which makes build fail
+ * under gcc 4.4.
+ *
+ * Using extra __may_alias__ type to allow aliasing
+ * in this case.
+ */
+typedef __u8  __attribute__((__may_alias__))  __u8_alias_t;
+typedef __u16 __attribute__((__may_alias__)) __u16_alias_t;
+typedef __u32 __attribute__((__may_alias__)) __u32_alias_t;
+typedef __u64 __attribute__((__may_alias__)) __u64_alias_t;
+
+static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
+{
+       switch (size) {
+       case 1: *(__u8_alias_t  *) res = *(volatile __u8_alias_t  *) p; break;
+       case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break;
+       case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break;
+       case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break;
+       default:
+               asm volatile ("" : : : "memory");
+               __builtin_memcpy((void *)res, (const void *)p, size);
+               asm volatile ("" : : : "memory");
+       }
+}
+
+static __always_inline void __write_once_size(volatile void *p, void *res, int size)
+{
+       switch (size) {
+       case 1: *(volatile  __u8_alias_t *) p = *(__u8_alias_t  *) res; break;
+       case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break;
+       case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break;
+       case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break;
+       default:
+               asm volatile ("" : : : "memory");
+               __builtin_memcpy((void *)p, (const void *)res, size);
+               asm volatile ("" : : : "memory");
+       }
+}
+
+#define READ_ONCE(x)                                   \
+({                                                     \
+       union { typeof(x) __val; char __c[1]; } __u =   \
+               { .__c = { 0 } };                       \
+       __read_once_size(&(x), __u.__c, sizeof(x));     \
+       __u.__val;                                      \
+})
+
+#define WRITE_ONCE(x, val)                             \
+({                                                     \
+       union { typeof(x) __val; char __c[1]; } __u =   \
+               { .__val = (val) };                     \
+       __write_once_size(&(x), __u.__c, sizeof(x));    \
+       __u.__val;                                      \
+})
+
+/* Add a value using relaxed read and relaxed write. Less expensive than
+ * fetch_add when there is no write concurrency.
+ */
+#define NO_TEAR_ADD(x, val) WRITE_ONCE((x), READ_ONCE(x) + (val))
+#define NO_TEAR_INC(x) NO_TEAR_ADD((x), 1)
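+
+/*
+ * Example (sketch, hypothetical map and field names): a per-CPU counter with
+ * a single writer per slot can be bumped without atomics:
+ *
+ *	struct datarec *rec = bpf_map_lookup_elem(&rx_cnt, &key);
+ *	if (rec)
+ *		NO_TEAR_INC(rec->processed);
+ */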
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#endif
diff --git a/samples/bpf/xdp_sample_shared.h b/samples/bpf/xdp_sample_shared.h
new file mode 100644 (file)
index 0000000..8a7669a
--- /dev/null
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef _XDP_SAMPLE_SHARED_H
+#define _XDP_SAMPLE_SHARED_H
+
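+/*
+ * Each record is padded to an (assumed) 64-byte cache line so that per-CPU
+ * slots in the mmap()ed stats arrays do not false-share when CPUs update
+ * adjacent records concurrently.
+ */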
+struct datarec {
+       size_t processed;
+       size_t dropped;
+       size_t issue;
+       union {
+               size_t xdp_pass;
+               size_t info;
+       };
+       size_t xdp_drop;
+       size_t xdp_redirect;
+} __attribute__((aligned(64)));
+
+#endif
diff --git a/samples/bpf/xdp_sample_user.c b/samples/bpf/xdp_sample_user.c
new file mode 100644 (file)
index 0000000..b32d821
--- /dev/null
@@ -0,0 +1,1673 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <linux/ethtool.h>
+#include <linux/hashtable.h>
+#include <linux/if_link.h>
+#include <linux/jhash.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/sockios.h>
+#include <locale.h>
+#include <math.h>
+#include <net/if.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <sys/signalfd.h>
+#include <sys/sysinfo.h>
+#include <sys/timerfd.h>
+#include <sys/utsname.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "bpf_util.h"
+#include "xdp_sample_user.h"
+
+#define __sample_print(fmt, cond, ...)                                         \
+       ({                                                                     \
+               if (cond)                                                      \
+                       printf(fmt, ##__VA_ARGS__);                            \
+       })
+
+#define print_always(fmt, ...) __sample_print(fmt, 1, ##__VA_ARGS__)
+#define print_default(fmt, ...)                                                \
+       __sample_print(fmt, sample_log_level & LL_DEFAULT, ##__VA_ARGS__)
+#define __print_err(err, fmt, ...)                                             \
+       ({                                                                     \
+               __sample_print(fmt, err > 0 || sample_log_level & LL_DEFAULT,  \
+                              ##__VA_ARGS__);                                 \
+               sample_err_exp = sample_err_exp ? true : err > 0;              \
+       })
+#define print_err(err, fmt, ...) __print_err(err, fmt, ##__VA_ARGS__)
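+
+/*
+ * print_err() prints even in terse mode when err is non-zero (and always in
+ * default mode), and records in sample_err_exp that an error line was shown,
+ * which later forces a separating newline in simple mode.
+ */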
+
+#define __COLUMN(x) "%'10" x " %-13s"
+#define FMT_COLUMNf __COLUMN(".0f")
+#define FMT_COLUMNd __COLUMN("d")
+#define FMT_COLUMNl __COLUMN("llu")
+#define RX(rx) rx, "rx/s"
+#define PPS(pps) pps, "pkt/s"
+#define DROP(drop) drop, "drop/s"
+#define ERR(err) err, "error/s"
+#define HITS(hits) hits, "hit/s"
+#define XMIT(xmit) xmit, "xmit/s"
+#define PASS(pass) pass, "pass/s"
+#define REDIR(redir) redir, "redir/s"
+#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
+
+#define XDP_UNKNOWN (XDP_REDIRECT + 1)
+#define XDP_ACTION_MAX (XDP_UNKNOWN + 1)
+#define XDP_REDIRECT_ERR_MAX 7
+
+enum map_type {
+       MAP_RX,
+       MAP_REDIRECT_ERR,
+       MAP_CPUMAP_ENQUEUE,
+       MAP_CPUMAP_KTHREAD,
+       MAP_EXCEPTION,
+       MAP_DEVMAP_XMIT,
+       MAP_DEVMAP_XMIT_MULTI,
+       NUM_MAP,
+};
+
+enum log_level {
+       LL_DEFAULT = 1U << 0,
+       LL_SIMPLE = 1U << 1,
+       LL_DEBUG = 1U << 2,
+};
+
+struct record {
+       __u64 timestamp;
+       struct datarec total;
+       struct datarec *cpu;
+};
+
+struct map_entry {
+       struct hlist_node node;
+       __u64 pair;
+       struct record val;
+};
+
+struct stats_record {
+       struct record rx_cnt;
+       struct record redir_err[XDP_REDIRECT_ERR_MAX];
+       struct record kthread;
+       struct record exception[XDP_ACTION_MAX];
+       struct record devmap_xmit;
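+       /* 2^5 = 32 buckets, keyed by a ((from_ifindex << 32) | to_ifindex) pair */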
+       DECLARE_HASHTABLE(xmit_map, 5);
+       struct record enq[];
+};
+
+struct sample_output {
+       struct {
+               __u64 rx;
+               __u64 redir;
+               __u64 drop;
+               __u64 drop_xmit;
+               __u64 err;
+               __u64 xmit;
+       } totals;
+       struct {
+               __u64 pps;
+               __u64 drop;
+               __u64 err;
+       } rx_cnt;
+       struct {
+               __u64 suc;
+               __u64 err;
+       } redir_cnt;
+       struct {
+               __u64 hits;
+       } except_cnt;
+       struct {
+               __u64 pps;
+               __u64 drop;
+               __u64 err;
+               double bavg;
+       } xmit_cnt;
+};
+
+struct xdp_desc {
+       int ifindex;
+       __u32 prog_id;
+       int flags;
+} sample_xdp_progs[32];
+
+struct datarec *sample_mmap[NUM_MAP];
+struct bpf_map *sample_map[NUM_MAP];
+size_t sample_map_count[NUM_MAP];
+enum log_level sample_log_level;
+struct sample_output sample_out;
+unsigned long sample_interval;
+bool sample_err_exp;
+int sample_xdp_cnt;
+int sample_n_cpus;
+int sample_sig_fd;
+int sample_mask;
+
+static const char *xdp_redirect_err_names[XDP_REDIRECT_ERR_MAX] = {
+       /* Key 1 collects all unrecognized (unknown) errors */
+       "Success",
+       "Unknown",
+       "EINVAL",
+       "ENETDOWN",
+       "EMSGSIZE",
+       "EOPNOTSUPP",
+       "ENOSPC",
+};
+
+/* Keyed from Unknown */
+static const char *xdp_redirect_err_help[XDP_REDIRECT_ERR_MAX - 1] = {
+       "Unknown error",
+       "Invalid redirection",
+       "Device being redirected to is down",
+       "Packet length too large for device",
+       "Operation not supported",
+       "No space in ptr_ring of cpumap kthread",
+};
+
+static const char *xdp_action_names[XDP_ACTION_MAX] = {
+       [XDP_ABORTED]  = "XDP_ABORTED",
+       [XDP_DROP]     = "XDP_DROP",
+       [XDP_PASS]     = "XDP_PASS",
+       [XDP_TX]       = "XDP_TX",
+       [XDP_REDIRECT] = "XDP_REDIRECT",
+       [XDP_UNKNOWN]  = "XDP_UNKNOWN",
+};
+
+static __u64 gettime(void)
+{
+       struct timespec t;
+       int res;
+
+       res = clock_gettime(CLOCK_MONOTONIC, &t);
+       if (res < 0) {
+               fprintf(stderr, "Error with clock_gettime! (%i)\n", res);
+               return UINT64_MAX;
+       }
+       return (__u64)t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
+}
+
+static const char *action2str(int action)
+{
+       if (action < XDP_ACTION_MAX)
+               return xdp_action_names[action];
+       return NULL;
+}
+
+static void sample_print_help(int mask)
+{
+       printf("Output format description\n\n"
+              "By default, redirect success statistics are disabled; use -s to enable them.\n"
+              "Terse output mode is the default; verbose mode can be activated using -v\n"
+              "Use SIGQUIT (Ctrl + \\) to switch the mode dynamically at runtime\n\n"
+              "Terse mode displays at most the following fields:\n"
+              "  rx/s        Number of packets received per second\n"
+              "  redir/s     Number of packets successfully redirected per second\n"
+              "  err,drop/s  Aggregated count of errors per second (including dropped packets)\n"
+              "  xmit/s      Number of packets transmitted on the output device per second\n\n"
+              "Output description for verbose mode:\n"
+              "  FIELD                 DESCRIPTION\n");
+
+       if (mask & SAMPLE_RX_CNT) {
+               printf("  receive\t\tDisplays the number of packets received & errors encountered\n"
+                      " \t\t\tWhenever an error or packet drop occurs, details of per CPU error\n"
+                      " \t\t\tand drop statistics will be expanded inline in terse mode.\n"
+                      " \t\t\t\tpkt/s     - Packets received per second\n"
+                      " \t\t\t\tdrop/s    - Packets dropped per second\n"
+                      " \t\t\t\terror/s   - Errors encountered per second\n\n");
+       }
+       if (mask & (SAMPLE_REDIRECT_CNT | SAMPLE_REDIRECT_ERR_CNT)) {
+               printf("  redirect\t\tDisplays the number of packets successfully redirected\n"
+                      "  \t\t\tErrors encountered are expanded under redirect_err field\n"
+                      "  \t\t\tNote that passing -s to enable it has a per-packet overhead\n"
+                      "  \t\t\t\tredir/s   - Packets redirected successfully per second\n\n"
+                      "  redirect_err\t\tDisplays the number of packets that failed redirection\n"
+                      "  \t\t\tThe errno is expanded under this field with per CPU count\n"
+                      "  \t\t\tThe recognized errors are:\n");
+
+               for (int i = 2; i < XDP_REDIRECT_ERR_MAX; i++)
+                       printf("\t\t\t  %s: %s\n", xdp_redirect_err_names[i],
+                              xdp_redirect_err_help[i - 1]);
+
+               printf("  \n\t\t\t\terror/s   - Packets that failed redirection per second\n\n");
+       }
+
+       if (mask & SAMPLE_CPUMAP_ENQUEUE_CNT) {
+               printf("  enqueue to cpu N\tDisplays the number of packets enqueued to bulk queue of CPU N\n"
+                      "  \t\t\tExpands to cpu:FROM->N to display enqueue stats for each CPU enqueuing to CPU N\n"
+                      "  \t\t\tReceived packets can be associated with the CPU the redirect\n"
+                      "  \t\t\tprogram is enqueuing packets to.\n"
+                      "  \t\t\t\tpkt/s    - Packets enqueued per second from other CPU to CPU N\n"
+                      "  \t\t\t\tdrop/s   - Packets dropped when trying to enqueue to CPU N\n"
+                      "  \t\t\t\tbulk-avg - Average number of packets processed for each event\n\n");
+       }
+
+       if (mask & SAMPLE_CPUMAP_KTHREAD_CNT) {
+               printf("  kthread\t\tDisplays the number of packets processed in CPUMAP kthread for each CPU\n"
+                      "  \t\t\tPackets consumed from ptr_ring in kthread, and its xdp_stats (after calling \n"
+                      "  \t\t\tCPUMAP bpf prog) are expanded below this. xdp_stats are expanded as a total and\n"
+                      "  \t\t\tthen per-CPU to associate it to each CPU's pinned CPUMAP kthread.\n"
+                      "  \t\t\t\tpkt/s    - Packets consumed per second from ptr_ring\n"
+                      "  \t\t\t\tdrop/s   - Packets dropped per second in kthread\n"
+                      "  \t\t\t\tsched    - Number of times kthread called schedule()\n\n"
+                      "  \t\t\txdp_stats (also expands to per-CPU counts)\n"
+                      "  \t\t\t\tpass/s  - XDP_PASS count for CPUMAP program execution\n"
+                      "  \t\t\t\tdrop/s  - XDP_DROP count for CPUMAP program execution\n"
+                      "  \t\t\t\tredir/s - XDP_REDIRECT count for CPUMAP program execution\n\n");
+       }
+
+       if (mask & SAMPLE_EXCEPTION_CNT) {
+               printf("  xdp_exception\t\tDisplays xdp_exception tracepoint events\n"
+                      "  \t\t\tThis can occur due to internal driver errors, unrecognized\n"
+                      "  \t\t\tXDP actions, or an explicit user trigger via XDP_ABORTED\n"
+                      "  \t\t\tEach action is expanded below this field with its count\n"
+                      "  \t\t\t\thit/s     - Number of times the tracepoint was hit per second\n\n");
+       }
+
+       if (mask & SAMPLE_DEVMAP_XMIT_CNT) {
+               printf("  devmap_xmit\t\tDisplays devmap_xmit tracepoint events\n"
+                      "  \t\t\tThis tracepoint is invoked for successful transmissions on the\n"
+                      "  \t\t\toutput device, but these statistics are not available in generic\n"
+                      "  \t\t\tXDP mode, so they are omitted from the output when using SKB mode\n"
+                      "  \t\t\t\txmit/s    - Number of packets that were transmitted per second\n"
+                      "  \t\t\t\tdrop/s    - Number of packets that failed transmissions per second\n"
+                      "  \t\t\t\tdrv_err/s - Number of internal driver errors per second\n"
+                      "  \t\t\t\tbulk-avg  - Average number of packets processed for each event\n\n");
+       }
+}
+
+void sample_usage(char *argv[], const struct option *long_options,
+                 const char *doc, int mask, bool error)
+{
+       int i;
+
+       if (!error)
+               sample_print_help(mask);
+
+       printf("\n%s\nOptions for %s:\n", doc, argv[0]);
+       for (i = 0; long_options[i].name != 0; i++) {
+               printf(" --%-15s", long_options[i].name);
+               if (long_options[i].flag != NULL)
+                       printf(" flag (internal value: %d)",
+                              *long_options[i].flag);
+               else
+                       printf("\t short-option: -%c", long_options[i].val);
+               printf("\n");
+       }
+       printf("\n");
+}
+
+static struct datarec *alloc_record_per_cpu(void)
+{
+       unsigned int nr_cpus = libbpf_num_possible_cpus();
+       struct datarec *array;
+
+       array = calloc(nr_cpus, sizeof(*array));
+       if (!array) {
+               fprintf(stderr, "Failed to allocate memory (nr_cpus: %u)\n",
+                       nr_cpus);
+               return NULL;
+       }
+       return array;
+}
+
+static int map_entry_init(struct map_entry *e, __u64 pair)
+{
+       e->pair = pair;
+       INIT_HLIST_NODE(&e->node);
+       e->val.timestamp = gettime();
+       e->val.cpu = alloc_record_per_cpu();
+       if (!e->val.cpu)
+               return -ENOMEM;
+       return 0;
+}
+
+static void map_collect_percpu(struct datarec *values, struct record *rec)
+{
+       /* For percpu maps, userspace gets a value per possible CPU */
+       unsigned int nr_cpus = libbpf_num_possible_cpus();
+       __u64 sum_xdp_redirect = 0;
+       __u64 sum_processed = 0;
+       __u64 sum_xdp_pass = 0;
+       __u64 sum_xdp_drop = 0;
+       __u64 sum_dropped = 0;
+       __u64 sum_issue = 0;
+       int i;
+
+       /* Get time as close as possible to reading map contents */
+       rec->timestamp = gettime();
+
+       /* Record and sum values from each CPU */
+       for (i = 0; i < nr_cpus; i++) {
+               rec->cpu[i].processed = READ_ONCE(values[i].processed);
+               rec->cpu[i].dropped = READ_ONCE(values[i].dropped);
+               rec->cpu[i].issue = READ_ONCE(values[i].issue);
+               rec->cpu[i].xdp_pass = READ_ONCE(values[i].xdp_pass);
+               rec->cpu[i].xdp_drop = READ_ONCE(values[i].xdp_drop);
+               rec->cpu[i].xdp_redirect = READ_ONCE(values[i].xdp_redirect);
+
+               sum_processed += rec->cpu[i].processed;
+               sum_dropped += rec->cpu[i].dropped;
+               sum_issue += rec->cpu[i].issue;
+               sum_xdp_pass += rec->cpu[i].xdp_pass;
+               sum_xdp_drop += rec->cpu[i].xdp_drop;
+               sum_xdp_redirect += rec->cpu[i].xdp_redirect;
+       }
+
+       rec->total.processed = sum_processed;
+       rec->total.dropped = sum_dropped;
+       rec->total.issue = sum_issue;
+       rec->total.xdp_pass = sum_xdp_pass;
+       rec->total.xdp_drop = sum_xdp_drop;
+       rec->total.xdp_redirect = sum_xdp_redirect;
+}
+
+static int map_collect_percpu_devmap(int map_fd, struct stats_record *rec)
+{
+       unsigned int nr_cpus = bpf_num_possible_cpus();
+       __u32 batch, count = 32;
+       struct datarec *values;
+       bool init = false;
+       __u64 *keys;
+       int i, ret;
+
+       keys = calloc(count, sizeof(__u64));
+       if (!keys)
+               return -ENOMEM;
+       values = calloc(count * nr_cpus, sizeof(struct datarec));
+       if (!values) {
+               free(keys);
+               return -ENOMEM;
+       }
+
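+       /* Walk the devmap xmit_multi map in batches; "batch" is the cursor
+        * that bpf_map_lookup_batch() advances on every call.
+        */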
+       for (;;) {
+               bool exit = false;
+
+               ret = bpf_map_lookup_batch(map_fd, init ? &batch : NULL, &batch,
+                                          keys, values, &count, NULL);
+               if (ret < 0) {
+                       if (errno != ENOENT)
+                               break;
+                       /* ENOENT means the final batch was just returned */
+                       exit = true;
+               }
+
+               init = true;
+               for (i = 0; i < count; i++) {
+                       struct map_entry *e, *x = NULL;
+                       __u64 pair = keys[i];
+                       struct datarec *arr;
+
+                       arr = &values[i * nr_cpus];
+                       hash_for_each_possible(rec->xmit_map, e, node, pair) {
+                               if (e->pair == pair) {
+                                       x = e;
+                                       break;
+                               }
+                       }
+                       if (!x) {
+                               x = calloc(1, sizeof(*x));
+                               if (!x)
+                                       goto cleanup;
+                               if (map_entry_init(x, pair) < 0) {
+                                       free(x);
+                                       goto cleanup;
+                               }
+                               hash_add(rec->xmit_map, &x->node, pair);
+                       }
+                       map_collect_percpu(arr, &x->val);
+               }
+
+               if (exit)
+                       break;
+               count = 32;
+       }
+
+       free(values);
+       free(keys);
+       return 0;
+cleanup:
+       free(values);
+       free(keys);
+       return -ENOMEM;
+}
+
+static struct stats_record *alloc_stats_record(void)
+{
+       struct stats_record *rec;
+       int i;
+
+       rec = calloc(1, sizeof(*rec) + sample_n_cpus * sizeof(struct record));
+       if (!rec) {
+               fprintf(stderr, "Failed to allocate memory\n");
+               return NULL;
+       }
+
+       if (sample_mask & SAMPLE_RX_CNT) {
+               rec->rx_cnt.cpu = alloc_record_per_cpu();
+               if (!rec->rx_cnt.cpu) {
+                       fprintf(stderr,
+                               "Failed to allocate rx_cnt per-CPU array\n");
+                       goto end_rec;
+               }
+       }
+       if (sample_mask & (SAMPLE_REDIRECT_CNT | SAMPLE_REDIRECT_ERR_CNT)) {
+               for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++) {
+                       rec->redir_err[i].cpu = alloc_record_per_cpu();
+                       if (!rec->redir_err[i].cpu) {
+                               fprintf(stderr,
+                                       "Failed to allocate redir_err per-CPU array for "
+                                       "\"%s\" case\n",
+                                       xdp_redirect_err_names[i]);
+                               while (i--)
+                                       free(rec->redir_err[i].cpu);
+                               goto end_rx_cnt;
+                       }
+               }
+       }
+       if (sample_mask & SAMPLE_CPUMAP_KTHREAD_CNT) {
+               rec->kthread.cpu = alloc_record_per_cpu();
+               if (!rec->kthread.cpu) {
+                       fprintf(stderr,
+                               "Failed to allocate kthread per-CPU array\n");
+                       goto end_redir;
+               }
+       }
+       if (sample_mask & SAMPLE_EXCEPTION_CNT) {
+               for (i = 0; i < XDP_ACTION_MAX; i++) {
+                       rec->exception[i].cpu = alloc_record_per_cpu();
+                       if (!rec->exception[i].cpu) {
+                               fprintf(stderr,
+                                       "Failed to allocate exception per-CPU array for "
+                                       "\"%s\" case\n",
+                                       action2str(i));
+                               while (i--)
+                                       free(rec->exception[i].cpu);
+                               goto end_kthread;
+                       }
+               }
+       }
+       if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT) {
+               rec->devmap_xmit.cpu = alloc_record_per_cpu();
+               if (!rec->devmap_xmit.cpu) {
+                       fprintf(stderr,
+                               "Failed to allocate devmap_xmit per-CPU array\n");
+                       goto end_exception;
+               }
+       }
+       if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI)
+               hash_init(rec->xmit_map);
+       if (sample_mask & SAMPLE_CPUMAP_ENQUEUE_CNT) {
+               for (i = 0; i < sample_n_cpus; i++) {
+                       rec->enq[i].cpu = alloc_record_per_cpu();
+                       if (!rec->enq[i].cpu) {
+                               fprintf(stderr,
+                                       "Failed to allocate enqueue per-CPU array for "
+                                       "CPU %d\n",
+                                       i);
+                               while (i--)
+                                       free(rec->enq[i].cpu);
+                               goto end_devmap_xmit;
+                       }
+               }
+       }
+
+       return rec;
+
+end_devmap_xmit:
+       free(rec->devmap_xmit.cpu);
+end_exception:
+       for (i = 0; i < XDP_ACTION_MAX; i++)
+               free(rec->exception[i].cpu);
+end_kthread:
+       free(rec->kthread.cpu);
+end_redir:
+       for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++)
+               free(rec->redir_err[i].cpu);
+end_rx_cnt:
+       free(rec->rx_cnt.cpu);
+end_rec:
+       free(rec);
+       return NULL;
+}
+
+static void free_stats_record(struct stats_record *r)
+{
+       struct hlist_node *tmp;
+       struct map_entry *e;
+       int i;
+
+       for (i = 0; i < sample_n_cpus; i++)
+               free(r->enq[i].cpu);
+       hash_for_each_safe(r->xmit_map, i, tmp, e, node) {
+               hash_del(&e->node);
+               free(e->val.cpu);
+               free(e);
+       }
+       free(r->devmap_xmit.cpu);
+       for (i = 0; i < XDP_ACTION_MAX; i++)
+               free(r->exception[i].cpu);
+       free(r->kthread.cpu);
+       for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++)
+               free(r->redir_err[i].cpu);
+       free(r->rx_cnt.cpu);
+       free(r);
+}
+
+static double calc_period(struct record *r, struct record *p)
+{
+       double period_ = 0;
+       __u64 period = 0;
+
+       period = r->timestamp - p->timestamp;
+       if (period > 0)
+               period_ = ((double)period / NANOSEC_PER_SEC);
+
+       return period_;
+}
+
+static double sample_round(double val)
+{
+       if (val - floor(val) < 0.5)
+               return floor(val);
+       return ceil(val);
+}
+
+static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_)
+{
+       __u64 packets = 0;
+       __u64 pps = 0;
+
+       if (period_ > 0) {
+               packets = r->processed - p->processed;
+               pps = sample_round(packets / period_);
+       }
+       return pps;
+}
+
+static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_)
+{
+       __u64 packets = 0;
+       __u64 pps = 0;
+
+       if (period_ > 0) {
+               packets = r->dropped - p->dropped;
+               pps = sample_round(packets / period_);
+       }
+       return pps;
+}
+
+static __u64 calc_errs_pps(struct datarec *r, struct datarec *p, double period_)
+{
+       __u64 packets = 0;
+       __u64 pps = 0;
+
+       if (period_ > 0) {
+               packets = r->issue - p->issue;
+               pps = sample_round(packets / period_);
+       }
+       return pps;
+}
+
+static __u64 calc_info_pps(struct datarec *r, struct datarec *p, double period_)
+{
+       __u64 packets = 0;
+       __u64 pps = 0;
+
+       if (period_ > 0) {
+               packets = r->info - p->info;
+               pps = sample_round(packets / period_);
+       }
+       return pps;
+}
+
+static void calc_xdp_pps(struct datarec *r, struct datarec *p, double *xdp_pass,
+                        double *xdp_drop, double *xdp_redirect, double period_)
+{
+       *xdp_pass = 0;
+       *xdp_drop = 0;
+       *xdp_redirect = 0;
+       if (period_ > 0) {
+               *xdp_redirect = (r->xdp_redirect - p->xdp_redirect) / period_;
+               *xdp_pass = (r->xdp_pass - p->xdp_pass) / period_;
+               *xdp_drop = (r->xdp_drop - p->xdp_drop) / period_;
+       }
+}
+
+static void stats_get_rx_cnt(struct stats_record *stats_rec,
+                            struct stats_record *stats_prev,
+                            unsigned int nr_cpus, struct sample_output *out)
+{
+       struct record *rec, *prev;
+       double t, pps, drop, err;
+       int i;
+
+       rec = &stats_rec->rx_cnt;
+       prev = &stats_prev->rx_cnt;
+       t = calc_period(rec, prev);
+
+       for (i = 0; i < nr_cpus; i++) {
+               struct datarec *r = &rec->cpu[i];
+               struct datarec *p = &prev->cpu[i];
+               char str[64];
+
+               pps = calc_pps(r, p, t);
+               drop = calc_drop_pps(r, p, t);
+               err = calc_errs_pps(r, p, t);
+               if (!pps && !drop && !err)
+                       continue;
+
+               snprintf(str, sizeof(str), "cpu:%d", i);
+               print_default("    %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf
+                             "\n",
+                             str, PPS(pps), DROP(drop), ERR(err));
+       }
+
+       if (out) {
+               pps = calc_pps(&rec->total, &prev->total, t);
+               drop = calc_drop_pps(&rec->total, &prev->total, t);
+               err = calc_errs_pps(&rec->total, &prev->total, t);
+
+               out->rx_cnt.pps = pps;
+               out->rx_cnt.drop = drop;
+               out->rx_cnt.err = err;
+               out->totals.rx += pps;
+               out->totals.drop += drop;
+               out->totals.err += err;
+       }
+}
+
+static void stats_get_cpumap_enqueue(struct stats_record *stats_rec,
+                                    struct stats_record *stats_prev,
+                                    unsigned int nr_cpus)
+{
+       struct record *rec, *prev;
+       double t, pps, drop, err;
+       int i, to_cpu;
+
+       /* cpumap enqueue stats */
+       for (to_cpu = 0; to_cpu < sample_n_cpus; to_cpu++) {
+               rec = &stats_rec->enq[to_cpu];
+               prev = &stats_prev->enq[to_cpu];
+               t = calc_period(rec, prev);
+
+               pps = calc_pps(&rec->total, &prev->total, t);
+               drop = calc_drop_pps(&rec->total, &prev->total, t);
+               err = calc_errs_pps(&rec->total, &prev->total, t);
+
+               if (pps > 0 || drop > 0) {
+                       char str[64];
+
+                       snprintf(str, sizeof(str), "enqueue to cpu %d", to_cpu);
+
+                       if (err > 0)
+                               err = pps / err; /* calc average bulk size */
+
+                       print_err(drop,
+                                 "  %-20s " FMT_COLUMNf FMT_COLUMNf __COLUMN(
+                                         ".2f") "\n",
+                                 str, PPS(pps), DROP(drop), err, "bulk-avg");
+               }
+
+               for (i = 0; i < nr_cpus; i++) {
+                       struct datarec *r = &rec->cpu[i];
+                       struct datarec *p = &prev->cpu[i];
+                       char str[64];
+
+                       pps = calc_pps(r, p, t);
+                       drop = calc_drop_pps(r, p, t);
+                       err = calc_errs_pps(r, p, t);
+                       if (!pps && !drop && !err)
+                               continue;
+
+                       snprintf(str, sizeof(str), "cpu:%d->%d", i, to_cpu);
+                       if (err > 0)
+                               err = pps / err; /* calc average bulk size */
+                       print_default(
+                               "    %-18s " FMT_COLUMNf FMT_COLUMNf __COLUMN(
+                                       ".2f") "\n",
+                               str, PPS(pps), DROP(drop), err, "bulk-avg");
+               }
+       }
+}
+
+static void stats_get_cpumap_remote(struct stats_record *stats_rec,
+                                   struct stats_record *stats_prev,
+                                   unsigned int nr_cpus)
+{
+       double xdp_pass, xdp_drop, xdp_redirect;
+       struct record *rec, *prev;
+       double t;
+       int i;
+
+       rec = &stats_rec->kthread;
+       prev = &stats_prev->kthread;
+       t = calc_period(rec, prev);
+
+       calc_xdp_pps(&rec->total, &prev->total, &xdp_pass, &xdp_drop,
+                    &xdp_redirect, t);
+       if (xdp_pass || xdp_drop || xdp_redirect) {
+               print_err(xdp_drop,
+                         "    %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n",
+                         "xdp_stats", PASS(xdp_pass), DROP(xdp_drop),
+                         REDIR(xdp_redirect));
+       }
+
+       for (i = 0; i < nr_cpus; i++) {
+               struct datarec *r = &rec->cpu[i];
+               struct datarec *p = &prev->cpu[i];
+               char str[64];
+
+               calc_xdp_pps(r, p, &xdp_pass, &xdp_drop, &xdp_redirect, t);
+               if (!xdp_pass && !xdp_drop && !xdp_redirect)
+                       continue;
+
+               snprintf(str, sizeof(str), "cpu:%d", i);
+               print_default("      %-16s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf
+                             "\n",
+                             str, PASS(xdp_pass), DROP(xdp_drop),
+                             REDIR(xdp_redirect));
+       }
+}
+
+static void stats_get_cpumap_kthread(struct stats_record *stats_rec,
+                                    struct stats_record *stats_prev,
+                                    unsigned int nr_cpus)
+{
+       struct record *rec, *prev;
+       double t, pps, drop, err;
+       int i;
+
+       rec = &stats_rec->kthread;
+       prev = &stats_prev->kthread;
+       t = calc_period(rec, prev);
+
+       pps = calc_pps(&rec->total, &prev->total, t);
+       drop = calc_drop_pps(&rec->total, &prev->total, t);
+       err = calc_errs_pps(&rec->total, &prev->total, t);
+
+       print_err(drop, "  %-20s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n",
+                 pps ? "kthread total" : "kthread", PPS(pps), DROP(drop), err,
+                 "sched");
+
+       for (i = 0; i < nr_cpus; i++) {
+               struct datarec *r = &rec->cpu[i];
+               struct datarec *p = &prev->cpu[i];
+               char str[64];
+
+               pps = calc_pps(r, p, t);
+               drop = calc_drop_pps(r, p, t);
+               err = calc_errs_pps(r, p, t);
+               if (!pps && !drop && !err)
+                       continue;
+
+               snprintf(str, sizeof(str), "cpu:%d", i);
+               print_default("    %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf
+                             "\n",
+                             str, PPS(pps), DROP(drop), err, "sched");
+       }
+}
+
+static void stats_get_redirect_cnt(struct stats_record *stats_rec,
+                                  struct stats_record *stats_prev,
+                                  unsigned int nr_cpus,
+                                  struct sample_output *out)
+{
+       struct record *rec, *prev;
+       double t, pps;
+       int i;
+
+       rec = &stats_rec->redir_err[0];
+       prev = &stats_prev->redir_err[0];
+       t = calc_period(rec, prev);
+       for (i = 0; i < nr_cpus; i++) {
+               struct datarec *r = &rec->cpu[i];
+               struct datarec *p = &prev->cpu[i];
+               char str[64];
+
+               pps = calc_pps(r, p, t);
+               if (!pps)
+                       continue;
+
+               snprintf(str, sizeof(str), "cpu:%d", i);
+               print_default("    %-18s " FMT_COLUMNf "\n", str, REDIR(pps));
+       }
+
+       if (out) {
+               pps = calc_pps(&rec->total, &prev->total, t);
+               out->redir_cnt.suc = pps;
+               out->totals.redir += pps;
+       }
+}
+
+static void stats_get_redirect_err_cnt(struct stats_record *stats_rec,
+                                      struct stats_record *stats_prev,
+                                      unsigned int nr_cpus,
+                                      struct sample_output *out)
+{
+       struct record *rec, *prev;
+       double t, drop, sum = 0;
+       int rec_i, i;
+
+       for (rec_i = 1; rec_i < XDP_REDIRECT_ERR_MAX; rec_i++) {
+               char str[64];
+
+               rec = &stats_rec->redir_err[rec_i];
+               prev = &stats_prev->redir_err[rec_i];
+               t = calc_period(rec, prev);
+
+               drop = calc_drop_pps(&rec->total, &prev->total, t);
+               if (drop > 0 && !out) {
+                       snprintf(str, sizeof(str),
+                                sample_log_level & LL_DEFAULT ? "%s total" :
+                                                                      "%s",
+                                xdp_redirect_err_names[rec_i]);
+                       print_err(drop, "    %-18s " FMT_COLUMNf "\n", str,
+                                 ERR(drop));
+               }
+
+               for (i = 0; i < nr_cpus; i++) {
+                       struct datarec *r = &rec->cpu[i];
+                       struct datarec *p = &prev->cpu[i];
+                       double drop;
+
+                       drop = calc_drop_pps(r, p, t);
+                       if (!drop)
+                               continue;
+
+                       snprintf(str, sizeof(str), "cpu:%d", i);
+                       print_default("       %-16s" FMT_COLUMNf "\n", str,
+                                     ERR(drop));
+               }
+
+               sum += drop;
+       }
+
+       if (out) {
+               out->redir_cnt.err = sum;
+               out->totals.err += sum;
+       }
+}
+
+static void stats_get_exception_cnt(struct stats_record *stats_rec,
+                                   struct stats_record *stats_prev,
+                                   unsigned int nr_cpus,
+                                   struct sample_output *out)
+{
+       double t, drop, sum = 0;
+       struct record *rec, *prev;
+       int rec_i, i;
+
+       for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) {
+               rec = &stats_rec->exception[rec_i];
+               prev = &stats_prev->exception[rec_i];
+               t = calc_period(rec, prev);
+
+               drop = calc_drop_pps(&rec->total, &prev->total, t);
+               /* Fold out errors after heading */
+               sum += drop;
+
+               if (drop > 0 && !out) {
+                       print_always("    %-18s " FMT_COLUMNf "\n",
+                                    action2str(rec_i), ERR(drop));
+
+                       for (i = 0; i < nr_cpus; i++) {
+                               struct datarec *r = &rec->cpu[i];
+                               struct datarec *p = &prev->cpu[i];
+                               char str[64];
+                               double drop;
+
+                               drop = calc_drop_pps(r, p, t);
+                               if (!drop)
+                                       continue;
+
+                               snprintf(str, sizeof(str), "cpu:%d", i);
+                               print_default("       %-16s" FMT_COLUMNf "\n",
+                                             str, ERR(drop));
+                       }
+               }
+       }
+
+       if (out) {
+               out->except_cnt.hits = sum;
+               out->totals.err += sum;
+       }
+}
+
+static void stats_get_devmap_xmit(struct stats_record *stats_rec,
+                                 struct stats_record *stats_prev,
+                                 unsigned int nr_cpus,
+                                 struct sample_output *out)
+{
+       double pps, drop, info, err;
+       struct record *rec, *prev;
+       double t;
+       int i;
+
+       rec = &stats_rec->devmap_xmit;
+       prev = &stats_prev->devmap_xmit;
+       t = calc_period(rec, prev);
+       for (i = 0; i < nr_cpus; i++) {
+               struct datarec *r = &rec->cpu[i];
+               struct datarec *p = &prev->cpu[i];
+               char str[64];
+
+               pps = calc_pps(r, p, t);
+               drop = calc_drop_pps(r, p, t);
+               err = calc_errs_pps(r, p, t);
+
+               if (!pps && !drop && !err)
+                       continue;
+
+               snprintf(str, sizeof(str), "cpu:%d", i);
+               info = calc_info_pps(r, p, t);
+               if (info > 0)
+                       info = (pps + drop) / info; /* calc avg bulk */
+               print_default("     %-18s" FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf
+                                     __COLUMN(".2f") "\n",
+                             str, XMIT(pps), DROP(drop), err, "drv_err/s",
+                             info, "bulk-avg");
+       }
+       if (out) {
+               pps = calc_pps(&rec->total, &prev->total, t);
+               drop = calc_drop_pps(&rec->total, &prev->total, t);
+               info = calc_info_pps(&rec->total, &prev->total, t);
+               if (info > 0)
+                       info = (pps + drop) / info; /* calc avg bulk */
+               err = calc_errs_pps(&rec->total, &prev->total, t);
+
+               out->xmit_cnt.pps = pps;
+               out->xmit_cnt.drop = drop;
+               out->xmit_cnt.bavg = info;
+               out->xmit_cnt.err = err;
+               out->totals.xmit += pps;
+               out->totals.drop_xmit += drop;
+               out->totals.err += err;
+       }
+}
+
+static void stats_get_devmap_xmit_multi(struct stats_record *stats_rec,
+                                       struct stats_record *stats_prev,
+                                       unsigned int nr_cpus,
+                                       struct sample_output *out,
+                                       bool xmit_total)
+{
+       double pps, drop, info, err;
+       struct map_entry *entry;
+       struct record *r, *p;
+       double t;
+       int bkt;
+
+       hash_for_each(stats_rec->xmit_map, bkt, entry, node) {
+               struct map_entry *e, *x = NULL;
+               char ifname_from[IFNAMSIZ];
+               char ifname_to[IFNAMSIZ];
+               const char *fstr, *tstr;
+               unsigned long prev_time;
+               struct record beg = {};
+               __u32 from_idx, to_idx;
+               char str[128];
+               __u64 pair;
+               int i;
+
+               prev_time = sample_interval * NANOSEC_PER_SEC;
+
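+               /* pair packs a redirect route: (from_ifindex << 32) | to_ifindex */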
+               pair = entry->pair;
+               from_idx = pair >> 32;
+               to_idx = pair & 0xFFFFFFFF;
+
+               r = &entry->val;
+               beg.timestamp = r->timestamp - prev_time;
+
+               /* Find matching entry from stats_prev map */
+               hash_for_each_possible(stats_prev->xmit_map, e, node, pair) {
+                       if (e->pair == pair) {
+                               x = e;
+                               break;
+                       }
+               }
+               if (x)
+                       p = &x->val;
+               else
+                       p = &beg;
+               t = calc_period(r, p);
+               pps = calc_pps(&r->total, &p->total, t);
+               drop = calc_drop_pps(&r->total, &p->total, t);
+               info = calc_info_pps(&r->total, &p->total, t);
+               if (info > 0)
+                       info = (pps + drop) / info; /* calc avg bulk */
+               err = calc_errs_pps(&r->total, &p->total, t);
+
+               if (out) {
+                       /* We are responsible for filling out totals */
+                       out->totals.xmit += pps;
+                       out->totals.drop_xmit += drop;
+                       out->totals.err += err;
+                       continue;
+               }
+
+               fstr = tstr = NULL;
+               if (if_indextoname(from_idx, ifname_from))
+                       fstr = ifname_from;
+               if (if_indextoname(to_idx, ifname_to))
+                       tstr = ifname_to;
+
+               snprintf(str, sizeof(str), "xmit %s->%s", fstr ?: "?",
+                        tstr ?: "?");
+               /* Skip idle streams of redirection */
+               if (pps || drop || err) {
+                       print_err(drop,
+                                 "  %-20s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf
+                                 __COLUMN(".2f") "\n", str, XMIT(pps), DROP(drop),
+                                 err, "drv_err/s", info, "bulk-avg");
+               }
+
+               for (i = 0; i < nr_cpus; i++) {
+                       struct datarec *rc = &r->cpu[i];
+                       struct datarec *pc, p_beg = {};
+                       char str[64];
+
+                       pc = p == &beg ? &p_beg : &p->cpu[i];
+
+                       pps = calc_pps(rc, pc, t);
+                       drop = calc_drop_pps(rc, pc, t);
+                       err = calc_errs_pps(rc, pc, t);
+
+                       if (!pps && !drop && !err)
+                               continue;
+
+                       snprintf(str, sizeof(str), "cpu:%d", i);
+                       info = calc_info_pps(rc, pc, t);
+                       if (info > 0)
+                               info = (pps + drop) / info; /* calc avg bulk */
+
+                       print_default("     %-18s" FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf
+                                     __COLUMN(".2f") "\n", str, XMIT(pps),
+                                     DROP(drop), err, "drv_err/s", info, "bulk-avg");
+               }
+       }
+}
+
+static void stats_print(const char *prefix, int mask, struct stats_record *r,
+                       struct stats_record *p, struct sample_output *out)
+{
+       int nr_cpus = libbpf_num_possible_cpus();
+       const char *str;
+
+       print_always("%-23s", prefix ?: "Summary");
+       if (mask & SAMPLE_RX_CNT)
+               print_always(FMT_COLUMNl, RX(out->totals.rx));
+       if (mask & SAMPLE_REDIRECT_CNT)
+               print_always(FMT_COLUMNl, REDIR(out->totals.redir));
+       printf(FMT_COLUMNl,
+              out->totals.err + out->totals.drop + out->totals.drop_xmit,
+              "err,drop/s");
+       if (mask & SAMPLE_DEVMAP_XMIT_CNT ||
+           mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI)
+               printf(FMT_COLUMNl, XMIT(out->totals.xmit));
+       printf("\n");
+
+       if (mask & SAMPLE_RX_CNT) {
+               str = (sample_log_level & LL_DEFAULT) && out->rx_cnt.pps ?
+                                   "receive total" :
+                                   "receive";
+               print_err((out->rx_cnt.err || out->rx_cnt.drop),
+                         "  %-20s " FMT_COLUMNl FMT_COLUMNl FMT_COLUMNl "\n",
+                         str, PPS(out->rx_cnt.pps), DROP(out->rx_cnt.drop),
+                         ERR(out->rx_cnt.err));
+
+               stats_get_rx_cnt(r, p, nr_cpus, NULL);
+       }
+
+       if (mask & SAMPLE_CPUMAP_ENQUEUE_CNT)
+               stats_get_cpumap_enqueue(r, p, nr_cpus);
+
+       if (mask & SAMPLE_CPUMAP_KTHREAD_CNT) {
+               stats_get_cpumap_kthread(r, p, nr_cpus);
+               stats_get_cpumap_remote(r, p, nr_cpus);
+       }
+
+       if (mask & SAMPLE_REDIRECT_CNT) {
+               str = out->redir_cnt.suc ? "redirect total" : "redirect";
+               print_default("  %-20s " FMT_COLUMNl "\n", str,
+                             REDIR(out->redir_cnt.suc));
+
+               stats_get_redirect_cnt(r, p, nr_cpus, NULL);
+       }
+
+       if (mask & SAMPLE_REDIRECT_ERR_CNT) {
+               str = (sample_log_level & LL_DEFAULT) && out->redir_cnt.err ?
+                                   "redirect_err total" :
+                                   "redirect_err";
+               print_err(out->redir_cnt.err, "  %-20s " FMT_COLUMNl "\n", str,
+                         ERR(out->redir_cnt.err));
+
+               stats_get_redirect_err_cnt(r, p, nr_cpus, NULL);
+       }
+
+       if (mask & SAMPLE_EXCEPTION_CNT) {
+               str = out->except_cnt.hits ? "xdp_exception total" :
+                                                  "xdp_exception";
+
+               print_err(out->except_cnt.hits, "  %-20s " FMT_COLUMNl "\n", str,
+                         HITS(out->except_cnt.hits));
+
+               stats_get_exception_cnt(r, p, nr_cpus, NULL);
+       }
+
+       if (mask & SAMPLE_DEVMAP_XMIT_CNT) {
+               str = (sample_log_level & LL_DEFAULT) && out->xmit_cnt.pps ?
+                                   "devmap_xmit total" :
+                                   "devmap_xmit";
+
+               print_err(out->xmit_cnt.err || out->xmit_cnt.drop,
+                         "  %-20s " FMT_COLUMNl FMT_COLUMNl FMT_COLUMNl
+                                 __COLUMN(".2f") "\n",
+                         str, XMIT(out->xmit_cnt.pps),
+                         DROP(out->xmit_cnt.drop), out->xmit_cnt.err,
+                         "drv_err/s", out->xmit_cnt.bavg, "bulk-avg");
+
+               stats_get_devmap_xmit(r, p, nr_cpus, NULL);
+       }
+
+       if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI)
+               stats_get_devmap_xmit_multi(r, p, nr_cpus, NULL,
+                                           mask & SAMPLE_DEVMAP_XMIT_CNT);
+
+       if (sample_log_level & LL_DEFAULT ||
+           ((sample_log_level & LL_SIMPLE) && sample_err_exp)) {
+               sample_err_exp = false;
+               printf("\n");
+       }
+}
+
+int sample_setup_maps(struct bpf_map **maps)
+{
+       sample_n_cpus = libbpf_num_possible_cpus();
+
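+       /* Size each stats map to one struct datarec slot per record per
+        * possible CPU; the arrays are mmap()ed later for cheap reads.
+        */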
+       for (int i = 0; i < MAP_DEVMAP_XMIT_MULTI; i++) {
+               sample_map[i] = maps[i];
+
+               switch (i) {
+               case MAP_RX:
+               case MAP_CPUMAP_KTHREAD:
+               case MAP_DEVMAP_XMIT:
+                       sample_map_count[i] = sample_n_cpus;
+                       break;
+               case MAP_REDIRECT_ERR:
+                       sample_map_count[i] =
+                               XDP_REDIRECT_ERR_MAX * sample_n_cpus;
+                       break;
+               case MAP_EXCEPTION:
+                       sample_map_count[i] = XDP_ACTION_MAX * sample_n_cpus;
+                       break;
+               case MAP_CPUMAP_ENQUEUE:
+                       sample_map_count[i] = sample_n_cpus * sample_n_cpus;
+                       break;
+               default:
+                       return -EINVAL;
+               }
+               if (bpf_map__resize(sample_map[i], sample_map_count[i]) < 0)
+                       return -errno;
+       }
+       sample_map[MAP_DEVMAP_XMIT_MULTI] = maps[MAP_DEVMAP_XMIT_MULTI];
+       return 0;
+}
+
+static int sample_setup_maps_mappings(void)
+{
+       for (int i = 0; i < MAP_DEVMAP_XMIT_MULTI; i++) {
+               size_t size = sample_map_count[i] * sizeof(struct datarec);
+
+               sample_mmap[i] = mmap(NULL, size, PROT_READ | PROT_WRITE,
+                                     MAP_SHARED, bpf_map__fd(sample_map[i]), 0);
+               if (sample_mmap[i] == MAP_FAILED)
+                       return -errno;
+       }
+       return 0;
+}
+
+int __sample_init(int mask)
+{
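+       /* Block SIGQUIT/SIGINT/SIGTERM and consume them through a signalfd
+        * instead, so signals are handled synchronously from the poll() loop
+        * in sample_run().
+        */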
+       sigset_t st;
+
+       sigemptyset(&st);
+       sigaddset(&st, SIGQUIT);
+       sigaddset(&st, SIGINT);
+       sigaddset(&st, SIGTERM);
+
+       if (sigprocmask(SIG_BLOCK, &st, NULL) < 0)
+               return -errno;
+
+       sample_sig_fd = signalfd(-1, &st, SFD_CLOEXEC | SFD_NONBLOCK);
+       if (sample_sig_fd < 0)
+               return -errno;
+
+       sample_mask = mask;
+
+       return sample_setup_maps_mappings();
+}
+
+static int __sample_remove_xdp(int ifindex, __u32 prog_id, int xdp_flags)
+{
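+       /* When prog_id is non-zero, detach only if the program currently
+        * attached to the interface is still the one this sample installed.
+        */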
+       __u32 cur_prog_id = 0;
+       int ret;
+
+       if (prog_id) {
+               ret = bpf_get_link_xdp_id(ifindex, &cur_prog_id, xdp_flags);
+               if (ret < 0)
+                       return -errno;
+
+               if (prog_id != cur_prog_id) {
+                       print_always(
+                               "Program on ifindex %d does not match installed "
+                               "program, skipping unload\n",
+                               ifindex);
+                       return -ENOENT;
+               }
+       }
+
+       return bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+}
+
+int sample_install_xdp(struct bpf_program *xdp_prog, int ifindex, bool generic,
+                      bool force)
+{
+       int ret, xdp_flags = 0;
+       __u32 prog_id = 0;
+
+       if (sample_xdp_cnt == 32) {
+               fprintf(stderr,
+                       "Limit of 32 XDP programs installed per sample reached\n");
+               return -ENOTSUP;
+       }
+
+       xdp_flags |= !force ? XDP_FLAGS_UPDATE_IF_NOEXIST : 0;
+       xdp_flags |= generic ? XDP_FLAGS_SKB_MODE : XDP_FLAGS_DRV_MODE;
+       ret = bpf_set_link_xdp_fd(ifindex, bpf_program__fd(xdp_prog),
+                                 xdp_flags);
+       if (ret < 0) {
+               ret = -errno;
+               fprintf(stderr,
+                       "Failed to install program \"%s\" on ifindex %d, mode = %s, "
+                       "force = %s: %s\n",
+                       bpf_program__name(xdp_prog), ifindex,
+                       generic ? "skb" : "native", force ? "true" : "false",
+                       strerror(-ret));
+               return ret;
+       }
+
+       ret = bpf_get_link_xdp_id(ifindex, &prog_id, xdp_flags);
+       if (ret < 0) {
+               ret = -errno;
+               fprintf(stderr,
+                       "Failed to get XDP program id for ifindex %d, removing program: %s\n",
+                       ifindex, strerror(errno));
+               __sample_remove_xdp(ifindex, 0, xdp_flags);
+               return ret;
+       }
+       sample_xdp_progs[sample_xdp_cnt++] =
+               (struct xdp_desc){ ifindex, prog_id, xdp_flags };
+
+       return 0;
+}
+
+static void sample_summary_print(void)
+{
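+       /* rx_cnt.pps here holds the total runtime in seconds accumulated by
+        * sample_summary_update(), not a packet rate.
+        */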
+       double period = sample_out.rx_cnt.pps;
+
+       if (sample_out.totals.rx) {
+               double pkts = sample_out.totals.rx;
+
+               print_always("  Packets received    : %'-10llu\n",
+                            sample_out.totals.rx);
+               print_always("  Average packets/s   : %'-10.0f\n",
+                            sample_round(pkts / period));
+       }
+       if (sample_out.totals.redir) {
+               double pkts = sample_out.totals.redir;
+
+               print_always("  Packets redirected  : %'-10llu\n",
+                            sample_out.totals.redir);
+               print_always("  Average redir/s     : %'-10.0f\n",
+                            sample_round(pkts / period));
+       }
+       if (sample_out.totals.drop)
+               print_always("  Rx dropped          : %'-10llu\n",
+                            sample_out.totals.drop);
+       if (sample_out.totals.drop_xmit)
+               print_always("  Tx dropped          : %'-10llu\n",
+                            sample_out.totals.drop_xmit);
+       if (sample_out.totals.err)
+               print_always("  Errors recorded     : %'-10llu\n",
+                            sample_out.totals.err);
+       if (sample_out.totals.xmit) {
+               double pkts = sample_out.totals.xmit;
+
+               print_always("  Packets transmitted : %'-10llu\n",
+                            sample_out.totals.xmit);
+               print_always("  Average transmit/s  : %'-10.0f\n",
+                            sample_round(pkts / period));
+       }
+}
+
+void sample_exit(int status)
+{
+       size_t size;
+
+       /* Only the first MAP_DEVMAP_XMIT_MULTI maps were mmap()ed */
+       for (int i = 0; i < MAP_DEVMAP_XMIT_MULTI; i++) {
+               size = sample_map_count[i] * sizeof(**sample_mmap);
+               munmap(sample_mmap[i], size);
+       }
+       while (sample_xdp_cnt--) {
+               int i = sample_xdp_cnt, ifindex, xdp_flags;
+               __u32 prog_id;
+
+               prog_id = sample_xdp_progs[i].prog_id;
+               ifindex = sample_xdp_progs[i].ifindex;
+               xdp_flags = sample_xdp_progs[i].flags;
+
+               __sample_remove_xdp(ifindex, prog_id, xdp_flags);
+       }
+       sample_summary_print();
+       close(sample_sig_fd);
+       exit(status);
+}
+
+static int sample_stats_collect(struct stats_record *rec)
+{
+       int i;
+
+       if (sample_mask & SAMPLE_RX_CNT)
+               map_collect_percpu(sample_mmap[MAP_RX], &rec->rx_cnt);
+
+       if (sample_mask & SAMPLE_REDIRECT_CNT)
+               map_collect_percpu(sample_mmap[MAP_REDIRECT_ERR], &rec->redir_err[0]);
+
+       if (sample_mask & SAMPLE_REDIRECT_ERR_CNT) {
+               for (i = 1; i < XDP_REDIRECT_ERR_MAX; i++)
+                       map_collect_percpu(&sample_mmap[MAP_REDIRECT_ERR][i * sample_n_cpus],
+                                          &rec->redir_err[i]);
+       }
+
+       if (sample_mask & SAMPLE_CPUMAP_ENQUEUE_CNT)
+               for (i = 0; i < sample_n_cpus; i++)
+                       map_collect_percpu(&sample_mmap[MAP_CPUMAP_ENQUEUE][i * sample_n_cpus],
+                                          &rec->enq[i]);
+
+       if (sample_mask & SAMPLE_CPUMAP_KTHREAD_CNT)
+               map_collect_percpu(sample_mmap[MAP_CPUMAP_KTHREAD],
+                                  &rec->kthread);
+
+       if (sample_mask & SAMPLE_EXCEPTION_CNT)
+               for (i = 0; i < XDP_ACTION_MAX; i++)
+                       map_collect_percpu(&sample_mmap[MAP_EXCEPTION][i * sample_n_cpus],
+                                          &rec->exception[i]);
+
+       if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT)
+               map_collect_percpu(sample_mmap[MAP_DEVMAP_XMIT], &rec->devmap_xmit);
+
+       if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) {
+               if (map_collect_percpu_devmap(bpf_map__fd(sample_map[MAP_DEVMAP_XMIT_MULTI]), rec) < 0)
+                       return -EINVAL;
+       }
+       return 0;
+}
+
+static void sample_summary_update(struct sample_output *out, int interval)
+{
+       sample_out.totals.rx += out->totals.rx;
+       sample_out.totals.redir += out->totals.redir;
+       sample_out.totals.drop += out->totals.drop;
+       sample_out.totals.drop_xmit += out->totals.drop_xmit;
+       sample_out.totals.err += out->totals.err;
+       sample_out.totals.xmit += out->totals.xmit;
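+       /* Reuse rx_cnt.pps to accumulate total runtime in seconds; the
+        * final summary divides packet totals by it to compute averages.
+        */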
+       sample_out.rx_cnt.pps += interval;
+}
+
+static void sample_stats_print(int mask, struct stats_record *cur,
+                              struct stats_record *prev, char *prog_name,
+                              int interval)
+{
+       struct sample_output out = {};
+
+       if (mask & SAMPLE_RX_CNT)
+               stats_get_rx_cnt(cur, prev, 0, &out);
+       if (mask & SAMPLE_REDIRECT_CNT)
+               stats_get_redirect_cnt(cur, prev, 0, &out);
+       if (mask & SAMPLE_REDIRECT_ERR_CNT)
+               stats_get_redirect_err_cnt(cur, prev, 0, &out);
+       if (mask & SAMPLE_EXCEPTION_CNT)
+               stats_get_exception_cnt(cur, prev, 0, &out);
+       if (mask & SAMPLE_DEVMAP_XMIT_CNT)
+               stats_get_devmap_xmit(cur, prev, 0, &out);
+       else if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI)
+               stats_get_devmap_xmit_multi(cur, prev, 0, &out,
+                                           mask & SAMPLE_DEVMAP_XMIT_CNT);
+       sample_summary_update(&out, interval);
+
+       stats_print(prog_name, mask, cur, prev, &out);
+}
+
+void sample_switch_mode(void)
+{
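+       /* XOR with (LL_DEBUG - 1) flips the LL_DEFAULT and LL_SIMPLE bits,
+        * toggling between verbose and terse output.
+        */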
+       sample_log_level ^= LL_DEBUG - 1;
+}
+
+static int sample_signal_cb(void)
+{
+       struct signalfd_siginfo si;
+       int r;
+
+       r = read(sample_sig_fd, &si, sizeof(si));
+       if (r < 0)
+               return -errno;
+
+       switch (si.ssi_signo) {
+       case SIGQUIT:
+               sample_switch_mode();
+               printf("\n");
+               break;
+       default:
+               printf("\n");
+               return 1;
+       }
+
+       return 0;
+}
+
+/* Pointer swap trick */
+static void swap(struct stats_record **a, struct stats_record **b)
+{
+       struct stats_record *tmp;
+
+       tmp = *a;
+       *a = *b;
+       *b = tmp;
+}
+
+static int sample_timer_cb(int timerfd, struct stats_record **rec,
+                          struct stats_record **prev, int interval)
+{
+       char line[64] = "Summary";
+       int ret;
+       __u64 t;
+
+       ret = read(timerfd, &t, sizeof(t));
+       if (ret < 0)
+               return -errno;
+
+       swap(prev, rec);
+       ret = sample_stats_collect(*rec);
+       if (ret < 0)
+               return ret;
+
+       if (sample_xdp_cnt == 2 && !(sample_mask & SAMPLE_SKIP_HEADING)) {
+               char fi[IFNAMSIZ];
+               char to[IFNAMSIZ];
+               const char *f, *t;
+
+               f = t = NULL;
+               if (if_indextoname(sample_xdp_progs[0].ifindex, fi))
+                       f = fi;
+               if (if_indextoname(sample_xdp_progs[1].ifindex, to))
+                       t = to;
+
+               snprintf(line, sizeof(line), "%s->%s", f ?: "?", t ?: "?");
+       }
+
+       sample_stats_print(sample_mask, *rec, *prev, line, interval);
+       return 0;
+}
+
+int sample_run(int interval, void (*post_cb)(void *), void *ctx)
+{
+       struct timespec ts = { interval, 0 };
+       struct itimerspec its = { ts, ts };
+       struct stats_record *rec, *prev;
+       struct pollfd pfd[2] = {};
+       int timerfd, ret;
+
+       if (!interval) {
+               fprintf(stderr, "Incorrect interval 0\n");
+               return -EINVAL;
+       }
+       sample_interval = interval;
+       /* Pretty print numbers */
+       setlocale(LC_NUMERIC, "en_US.UTF-8");
+
+       timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC | TFD_NONBLOCK);
+       if (timerfd < 0)
+               return -errno;
+       timerfd_settime(timerfd, 0, &its, NULL);
+
+       pfd[0].fd = sample_sig_fd;
+       pfd[0].events = POLLIN;
+
+       pfd[1].fd = timerfd;
+       pfd[1].events = POLLIN;
+
+       ret = -ENOMEM;
+       rec = alloc_stats_record();
+       if (!rec)
+               goto end;
+       prev = alloc_stats_record();
+       if (!prev)
+               goto end_rec;
+
+       ret = sample_stats_collect(rec);
+       if (ret < 0)
+               goto end_rec_prev;
+
+       for (;;) {
+               ret = poll(pfd, 2, -1);
+               if (ret < 0) {
+                       if (errno == EINTR)
+                               continue;
+                       else
+                               break;
+               }
+
+               if (pfd[0].revents & POLLIN)
+                       ret = sample_signal_cb();
+               else if (pfd[1].revents & POLLIN)
+                       ret = sample_timer_cb(timerfd, &rec, &prev, interval);
+
+               if (ret)
+                       break;
+
+               if (post_cb)
+                       post_cb(ctx);
+       }
+
+end_rec_prev:
+       free_stats_record(prev);
+end_rec:
+       free_stats_record(rec);
+end:
+       close(timerfd);
+
+       return ret;
+}
+
+const char *get_driver_name(int ifindex)
+{
+       struct ethtool_drvinfo drv = {};
+       char ifname[IF_NAMESIZE];
+       static char drvname[32];
+       struct ifreq ifr = {};
+       int fd, r = 0;
+
+       fd = socket(AF_INET, SOCK_DGRAM, 0);
+       if (fd < 0)
+               return "[error]";
+
+       if (!if_indextoname(ifindex, ifname))
+               goto end;
+
+       drv.cmd = ETHTOOL_GDRVINFO;
+       safe_strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
+       ifr.ifr_data = (void *)&drv;
+
+       r = ioctl(fd, SIOCETHTOOL, &ifr);
+       if (r)
+               goto end;
+
+       safe_strncpy(drvname, drv.driver, sizeof(drvname));
+
+       close(fd);
+       return drvname;
+
+end:
+       r = errno;
+       close(fd);
+       return r == EOPNOTSUPP ? "loopback" : "[error]";
+}
+
+int get_mac_addr(int ifindex, void *mac_addr)
+{
+       char ifname[IF_NAMESIZE];
+       struct ifreq ifr = {};
+       int fd, r;
+
+       fd = socket(AF_INET, SOCK_DGRAM, 0);
+       if (fd < 0)
+               return -errno;
+
+       if (!if_indextoname(ifindex, ifname)) {
+               r = -errno;
+               goto end;
+       }
+
+       safe_strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
+
+       r = ioctl(fd, SIOCGIFHWADDR, &ifr);
+       if (r) {
+               r = -errno;
+               goto end;
+       }
+
+       memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char));
+
+end:
+       close(fd);
+       return r;
+}
+
+__attribute__((constructor)) static void sample_ctor(void)
+{
+       if (libbpf_set_strict_mode(LIBBPF_STRICT_ALL) < 0) {
+               fprintf(stderr, "Failed to set libbpf strict mode: %s\n",
+                       strerror(errno));
+               /* Just exit, nothing to cleanup right now */
+               exit(EXIT_FAIL_BPF);
+       }
+}
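
The signal path above assumes sample_sig_fd is a signalfd whose signals were blocked from normal delivery; __sample_init() (not part of this hunk) is presumably where that setup happens. A minimal sketch of such a setup, with the helper name made up:

    #include <errno.h>
    #include <signal.h>
    #include <sys/signalfd.h>

    static int setup_signal_fd(void)
    {
            sigset_t mask;

            sigemptyset(&mask);
            sigaddset(&mask, SIGQUIT);      /* toggles debug output above */
            sigaddset(&mask, SIGINT);       /* any other signal exits */
            sigaddset(&mask, SIGTERM);

            /* block default delivery so the signals arrive via the fd */
            if (sigprocmask(SIG_BLOCK, &mask, NULL) < 0)
                    return -errno;

            return signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK);
    }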
diff --git a/samples/bpf/xdp_sample_user.h b/samples/bpf/xdp_sample_user.h
new file mode 100644
index 0000000..d97465f
--- /dev/null
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef XDP_SAMPLE_USER_H
+#define XDP_SAMPLE_USER_H
+
+#include <bpf/libbpf.h>
+#include <linux/compiler.h>
+
+#include "xdp_sample_shared.h"
+
+enum stats_mask {
+       _SAMPLE_REDIRECT_MAP         = 1U << 0,
+       SAMPLE_RX_CNT                = 1U << 1,
+       SAMPLE_REDIRECT_ERR_CNT      = 1U << 2,
+       SAMPLE_CPUMAP_ENQUEUE_CNT    = 1U << 3,
+       SAMPLE_CPUMAP_KTHREAD_CNT    = 1U << 4,
+       SAMPLE_EXCEPTION_CNT         = 1U << 5,
+       SAMPLE_DEVMAP_XMIT_CNT       = 1U << 6,
+       SAMPLE_REDIRECT_CNT          = 1U << 7,
+       SAMPLE_REDIRECT_MAP_CNT      = SAMPLE_REDIRECT_CNT | _SAMPLE_REDIRECT_MAP,
+       SAMPLE_REDIRECT_ERR_MAP_CNT  = SAMPLE_REDIRECT_ERR_CNT | _SAMPLE_REDIRECT_MAP,
+       SAMPLE_DEVMAP_XMIT_CNT_MULTI = 1U << 8,
+       SAMPLE_SKIP_HEADING          = 1U << 9,
+};
+
+/* Exit return codes */
+#define EXIT_OK                        0
+#define EXIT_FAIL              1
+#define EXIT_FAIL_OPTION       2
+#define EXIT_FAIL_XDP          3
+#define EXIT_FAIL_BPF          4
+#define EXIT_FAIL_MEM          5
+
+int sample_setup_maps(struct bpf_map **maps);
+int __sample_init(int mask);
+void sample_exit(int status);
+int sample_run(int interval, void (*post_cb)(void *), void *ctx);
+
+void sample_switch_mode(void);
+int sample_install_xdp(struct bpf_program *xdp_prog, int ifindex, bool generic,
+                      bool force);
+void sample_usage(char *argv[], const struct option *long_options,
+                 const char *doc, int mask, bool error);
+
+const char *get_driver_name(int ifindex);
+int get_mac_addr(int ifindex, void *mac_addr);
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstringop-truncation"
+__attribute__((unused))
+static inline char *safe_strncpy(char *dst, const char *src, size_t size)
+{
+       if (!size)
+               return dst;
+       strncpy(dst, src, size - 1);
+       dst[size - 1] = '\0';
+       return dst;
+}
+#pragma GCC diagnostic pop
+
+#define __attach_tp(name)                                                      \
+       ({                                                                     \
+               if (!bpf_program__is_tracing(skel->progs.name))                \
+                       return -EINVAL;                                        \
+               skel->links.name = bpf_program__attach(skel->progs.name);      \
+               if (!skel->links.name)                                         \
+                       return -errno;                                         \
+       })
+
+#define sample_init_pre_load(skel)                                             \
+       ({                                                                     \
+               skel->rodata->nr_cpus = libbpf_num_possible_cpus();            \
+               sample_setup_maps((struct bpf_map *[]){                        \
+                       skel->maps.rx_cnt, skel->maps.redir_err_cnt,           \
+                       skel->maps.cpumap_enqueue_cnt,                         \
+                       skel->maps.cpumap_kthread_cnt,                         \
+                       skel->maps.exception_cnt, skel->maps.devmap_xmit_cnt,  \
+                       skel->maps.devmap_xmit_cnt_multi });                   \
+       })
+
+#define DEFINE_SAMPLE_INIT(name)                                               \
+       static int sample_init(struct name *skel, int mask)                    \
+       {                                                                      \
+               int ret;                                                       \
+               ret = __sample_init(mask);                                     \
+               if (ret < 0)                                                   \
+                       return ret;                                            \
+               if (mask & SAMPLE_REDIRECT_MAP_CNT)                            \
+                       __attach_tp(tp_xdp_redirect_map);                      \
+               if (mask & SAMPLE_REDIRECT_CNT)                                \
+                       __attach_tp(tp_xdp_redirect);                          \
+               if (mask & SAMPLE_REDIRECT_ERR_MAP_CNT)                        \
+                       __attach_tp(tp_xdp_redirect_map_err);                  \
+               if (mask & SAMPLE_REDIRECT_ERR_CNT)                            \
+                       __attach_tp(tp_xdp_redirect_err);                      \
+               if (mask & SAMPLE_CPUMAP_ENQUEUE_CNT)                          \
+                       __attach_tp(tp_xdp_cpumap_enqueue);                    \
+               if (mask & SAMPLE_CPUMAP_KTHREAD_CNT)                          \
+                       __attach_tp(tp_xdp_cpumap_kthread);                    \
+               if (mask & SAMPLE_EXCEPTION_CNT)                               \
+                       __attach_tp(tp_xdp_exception);                         \
+               if (mask & SAMPLE_DEVMAP_XMIT_CNT)                             \
+                       __attach_tp(tp_xdp_devmap_xmit);                       \
+               if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI)                       \
+                       __attach_tp(tp_xdp_devmap_xmit_multi);                 \
+               return 0;                                                      \
+       }
+
+#endif
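
Putting the pieces together, a converted sample would use this header roughly as sketched below; the xdp_redirect_kern skeleton name and the surrounding flow are assumptions, only the xdp_sample_user.h calls come from this file:

    DEFINE_SAMPLE_INIT(xdp_redirect_kern);  /* generates sample_init() */

    int main(int argc, char **argv)
    {
            int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_CNT |
                       SAMPLE_EXCEPTION_CNT | SAMPLE_DEVMAP_XMIT_CNT;
            struct xdp_redirect_kern *skel;

            skel = xdp_redirect_kern__open();
            if (!skel)
                    return EXIT_FAIL_BPF;
            sample_init_pre_load(skel);      /* nr_cpus + stats maps */
            if (xdp_redirect_kern__load(skel))
                    return EXIT_FAIL_BPF;
            if (sample_init(skel, mask) < 0) /* attach stats tracepoints */
                    return EXIT_FAIL_BPF;

            /* install XDP program(s) via sample_install_xdp(), then poll
             * and print statistics every two seconds until interrupted */
            sample_exit(sample_run(2, NULL, NULL));
    }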
index 2db6925..791f31d 100644
@@ -84,7 +84,7 @@ struct bpf_lpm_trie_key {
 
 struct bpf_cgroup_storage_key {
        __u64   cgroup_inode_id;        /* cgroup inode id */
-       __u32   attach_type;            /* program attach type */
+       __u32   attach_type;            /* program attach type (enum bpf_attach_type) */
 };
 
 union bpf_iter_link_info {
@@ -993,6 +993,7 @@ enum bpf_attach_type {
        BPF_SK_SKB_VERDICT,
        BPF_SK_REUSEPORT_SELECT,
        BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
+       BPF_PERF_EVENT,
        __MAX_BPF_ATTACH_TYPE
 };
 
@@ -1006,6 +1007,7 @@ enum bpf_link_type {
        BPF_LINK_TYPE_ITER = 4,
        BPF_LINK_TYPE_NETNS = 5,
        BPF_LINK_TYPE_XDP = 6,
+       BPF_LINK_TYPE_PERF_EVENT = 7,
 
        MAX_BPF_LINK_TYPE,
 };
@@ -1446,6 +1448,13 @@ union bpf_attr {
                                __aligned_u64   iter_info;      /* extra bpf_iter_link_info */
                                __u32           iter_info_len;  /* iter_info length */
                        };
+                       struct {
+                               /* black box user-provided value passed through
+                                * to the BPF program at execution time and
+                                * accessible through bpf_get_attach_cookie() BPF helper
+                                */
+                               __u64           bpf_cookie;
+                       } perf_event;
                };
        } link_create;
 
@@ -4847,6 +4856,27 @@ union bpf_attr {
  *             Get address of the traced function (for tracing and kprobe programs).
  *     Return
  *             Address of the traced function.
+ *
+ * u64 bpf_get_attach_cookie(void *ctx)
+ *     Description
+ *             Get bpf_cookie value provided (optionally) during the program
+ *             attachment. It might be different for each individual
+ *             attachment, even if the BPF program itself is the same.
+ *             Expects the BPF program context *ctx* as its first argument.
+ *
+ *             Supported for the following program types:
+ *                     - kprobe/uprobe;
+ *                     - tracepoint;
+ *                     - perf_event.
+ *     Return
+ *             Value specified by user at BPF link creation/attachment time
+ *             or 0, if it was not specified.
+ *
+ * long bpf_task_pt_regs(struct task_struct *task)
+ *     Description
+ *             Get the struct pt_regs associated with **task**.
+ *     Return
+ *             A pointer to struct pt_regs.
  */
 #define __BPF_FUNC_MAPPER(FN)          \
        FN(unspec),                     \
@@ -5023,6 +5053,8 @@ union bpf_attr {
        FN(timer_start),                \
        FN(timer_cancel),               \
        FN(get_func_ip),                \
+       FN(get_attach_cookie),          \
+       FN(task_pt_regs),               \
        /* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
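
On the BPF side, reading the cookie is a single helper call against the program context; a minimal sketch (the section and function names are illustrative):

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    __u64 last_cookie;

    SEC("kprobe/do_nanosleep")
    int BPF_KPROBE(on_nanosleep)
    {
            /* 0 if no bpf_cookie was supplied when the link was created */
            last_cookie = bpf_get_attach_cookie(ctx);
            return 0;
    }

    char LICENSE[] SEC("license") = "GPL";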
index c86c3e9..47afae3 100644
@@ -48,4 +48,57 @@ struct ethtool_channels {
        __u32   combined_count;
 };
 
+#define ETHTOOL_FWVERS_LEN     32
+#define ETHTOOL_BUSINFO_LEN    32
+#define ETHTOOL_EROMVERS_LEN   32
+
+/**
+ * struct ethtool_drvinfo - general driver and device information
+ * @cmd: Command number = %ETHTOOL_GDRVINFO
+ * @driver: Driver short name.  This should normally match the name
+ *     in its bus driver structure (e.g. pci_driver::name).  Must
+ *     not be an empty string.
+ * @version: Driver version string; may be an empty string
+ * @fw_version: Firmware version string; may be an empty string
+ * @erom_version: Expansion ROM version string; may be an empty string
+ * @bus_info: Device bus address.  This should match the dev_name()
+ *     string for the underlying bus device, if there is one.  May be
+ *     an empty string.
+ * @reserved2: Reserved for future use; see the note on reserved space.
+ * @n_priv_flags: Number of flags valid for %ETHTOOL_GPFLAGS and
+ *     %ETHTOOL_SPFLAGS commands; also the number of strings in the
+ *     %ETH_SS_PRIV_FLAGS set
+ * @n_stats: Number of u64 statistics returned by the %ETHTOOL_GSTATS
+ *     command; also the number of strings in the %ETH_SS_STATS set
+ * @testinfo_len: Number of results returned by the %ETHTOOL_TEST
+ *     command; also the number of strings in the %ETH_SS_TEST set
+ * @eedump_len: Size of EEPROM accessible through the %ETHTOOL_GEEPROM
+ *     and %ETHTOOL_SEEPROM commands, in bytes
+ * @regdump_len: Size of register dump returned by the %ETHTOOL_GREGS
+ *     command, in bytes
+ *
+ * Users can use the %ETHTOOL_GSSET_INFO command to get the number of
+ * strings in any string set (from Linux 2.6.34).
+ *
+ * Drivers should set at most @driver, @version, @fw_version and
+ * @bus_info in their get_drvinfo() implementation.  The ethtool
+ * core fills in the other fields using other driver operations.
+ */
+struct ethtool_drvinfo {
+       __u32   cmd;
+       char    driver[32];
+       char    version[32];
+       char    fw_version[ETHTOOL_FWVERS_LEN];
+       char    bus_info[ETHTOOL_BUSINFO_LEN];
+       char    erom_version[ETHTOOL_EROMVERS_LEN];
+       char    reserved2[12];
+       __u32   n_priv_flags;
+       __u32   n_stats;
+       __u32   testinfo_len;
+       __u32   eedump_len;
+       __u32   regdump_len;
+};
+
+#define ETHTOOL_GDRVINFO       0x00000003
+
 #endif /* _UAPI_LINUX_ETHTOOL_H */
index ec14aa7..74c3b73 100644
@@ -4,8 +4,9 @@
 RM ?= rm
 srctree = $(abs_srctree)
 
+VERSION_SCRIPT := libbpf.map
 LIBBPF_VERSION := $(shell \
-       grep -oE '^LIBBPF_([0-9.]+)' libbpf.map | \
+       grep -oE '^LIBBPF_([0-9.]+)' $(VERSION_SCRIPT) | \
        sort -rV | head -n1 | cut -d'_' -f2)
 LIBBPF_MAJOR_VERSION := $(firstword $(subst ., ,$(LIBBPF_VERSION)))
 
@@ -110,7 +111,6 @@ SHARED_OBJDIR       := $(OUTPUT)sharedobjs/
 STATIC_OBJDIR  := $(OUTPUT)staticobjs/
 BPF_IN_SHARED  := $(SHARED_OBJDIR)libbpf-in.o
 BPF_IN_STATIC  := $(STATIC_OBJDIR)libbpf-in.o
-VERSION_SCRIPT := libbpf.map
 BPF_HELPER_DEFS        := $(OUTPUT)bpf_helper_defs.h
 
 LIB_TARGET     := $(addprefix $(OUTPUT),$(LIB_TARGET))
@@ -163,10 +163,10 @@ $(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h
 
 $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION)
 
-$(OUTPUT)libbpf.so.$(LIBBPF_VERSION): $(BPF_IN_SHARED)
+$(OUTPUT)libbpf.so.$(LIBBPF_VERSION): $(BPF_IN_SHARED) $(VERSION_SCRIPT)
        $(QUIET_LINK)$(CC) $(LDFLAGS) \
                --shared -Wl,-soname,libbpf.so.$(LIBBPF_MAJOR_VERSION) \
-               -Wl,--version-script=$(VERSION_SCRIPT) $^ -lelf -lz -o $@
+               -Wl,--version-script=$(VERSION_SCRIPT) $< -lelf -lz -o $@
        @ln -sf $(@F) $(OUTPUT)libbpf.so
        @ln -sf $(@F) $(OUTPUT)libbpf.so.$(LIBBPF_MAJOR_VERSION)
 
@@ -181,7 +181,7 @@ $(OUTPUT)libbpf.pc:
 
 check: check_abi
 
-check_abi: $(OUTPUT)libbpf.so
+check_abi: $(OUTPUT)libbpf.so $(VERSION_SCRIPT)
        @if [ "$(GLOBAL_SYM_COUNT)" != "$(VERSIONED_SYM_COUNT)" ]; then  \
                echo "Warning: Num of global symbols in $(BPF_IN_SHARED)"        \
                     "($(GLOBAL_SYM_COUNT)) does NOT match with num of"  \
index 86dcac4..2401fad 100644
@@ -684,8 +684,13 @@ int bpf_link_create(int prog_fd, int target_fd,
        iter_info_len = OPTS_GET(opts, iter_info_len, 0);
        target_btf_id = OPTS_GET(opts, target_btf_id, 0);
 
-       if (iter_info_len && target_btf_id)
-               return libbpf_err(-EINVAL);
+       /* validate we don't have unexpected combinations of non-zero fields */
+       if (iter_info_len || target_btf_id) {
+               if (iter_info_len && target_btf_id)
+                       return libbpf_err(-EINVAL);
+               if (!OPTS_ZEROED(opts, target_btf_id))
+                       return libbpf_err(-EINVAL);
+       }
 
        memset(&attr, 0, sizeof(attr));
        attr.link_create.prog_fd = prog_fd;
@@ -693,14 +698,27 @@ int bpf_link_create(int prog_fd, int target_fd,
        attr.link_create.attach_type = attach_type;
        attr.link_create.flags = OPTS_GET(opts, flags, 0);
 
-       if (iter_info_len) {
-               attr.link_create.iter_info =
-                       ptr_to_u64(OPTS_GET(opts, iter_info, (void *)0));
-               attr.link_create.iter_info_len = iter_info_len;
-       } else if (target_btf_id) {
+       if (target_btf_id) {
                attr.link_create.target_btf_id = target_btf_id;
+               goto proceed;
        }
 
+       switch (attach_type) {
+       case BPF_TRACE_ITER:
+               attr.link_create.iter_info = ptr_to_u64(OPTS_GET(opts, iter_info, (void *)0));
+               attr.link_create.iter_info_len = iter_info_len;
+               break;
+       case BPF_PERF_EVENT:
+               attr.link_create.perf_event.bpf_cookie = OPTS_GET(opts, perf_event.bpf_cookie, 0);
+               if (!OPTS_ZEROED(opts, perf_event))
+                       return libbpf_err(-EINVAL);
+               break;
+       default:
+               if (!OPTS_ZEROED(opts, flags))
+                       return libbpf_err(-EINVAL);
+               break;
+       }
+proceed:
        fd = sys_bpf(BPF_LINK_CREATE, &attr, sizeof(attr));
        return libbpf_err_errno(fd);
 }
index 4f758f8..6fffb3c 100644
@@ -177,8 +177,14 @@ struct bpf_link_create_opts {
        union bpf_iter_link_info *iter_info;
        __u32 iter_info_len;
        __u32 target_btf_id;
+       union {
+               struct {
+                       __u64 bpf_cookie;
+               } perf_event;
+       };
+       size_t :0;
 };
-#define bpf_link_create_opts__last_field target_btf_id
+#define bpf_link_create_opts__last_field perf_event
 
 LIBBPF_API int bpf_link_create(int prog_fd, int target_fd,
                               enum bpf_attach_type attach_type,
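
A caller-side sketch of the new low-level path, assuming prog_fd is a loaded program and perf_fd an open perf event:

    DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts,
            .perf_event.bpf_cookie = 0x12345678);
    int link_fd;

    link_fd = bpf_link_create(prog_fd, perf_fd, BPF_PERF_EVENT, &opts);
    if (link_fd < 0)
            /* kernels without BPF perf link support fail here (EINVAL) */
            return -errno;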
index cb106e8..88d8825 100644
@@ -193,6 +193,8 @@ enum kern_feature_id {
        FEAT_MODULE_BTF,
        /* BTF_KIND_FLOAT support */
        FEAT_BTF_FLOAT,
+       /* BPF perf link support */
+       FEAT_PERF_LINK,
        __FEAT_CNT,
 };
 
@@ -4337,6 +4339,37 @@ static int probe_module_btf(void)
        return !err;
 }
 
+static int probe_perf_link(void)
+{
+       struct bpf_load_program_attr attr;
+       struct bpf_insn insns[] = {
+               BPF_MOV64_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       };
+       int prog_fd, link_fd, err;
+
+       memset(&attr, 0, sizeof(attr));
+       attr.prog_type = BPF_PROG_TYPE_TRACEPOINT;
+       attr.insns = insns;
+       attr.insns_cnt = ARRAY_SIZE(insns);
+       attr.license = "GPL";
+       prog_fd = bpf_load_program_xattr(&attr, NULL, 0);
+       if (prog_fd < 0)
+               return -errno;
+
+       /* use invalid perf_event FD to get EBADF, if link is supported;
+        * otherwise EINVAL should be returned
+        */
+       link_fd = bpf_link_create(prog_fd, -1, BPF_PERF_EVENT, NULL);
+       err = -errno; /* close() can clobber errno */
+
+       if (link_fd >= 0)
+               close(link_fd);
+       close(prog_fd);
+
+       return link_fd < 0 && err == -EBADF;
+}
+
 enum kern_feature_result {
        FEAT_UNKNOWN = 0,
        FEAT_SUPPORTED = 1,
@@ -4387,6 +4420,9 @@ static struct kern_feature_desc {
        [FEAT_BTF_FLOAT] = {
                "BTF_KIND_FLOAT support", probe_kern_btf_float,
        },
+       [FEAT_PERF_LINK] = {
+               "BPF perf link support", probe_perf_link,
+       },
 };
 
 static bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id)
@@ -5277,11 +5313,11 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog)
                                }
                                insn[1].imm = ext->kcfg.data_off;
                        } else /* EXT_KSYM */ {
-                               if (ext->ksym.type_id) { /* typed ksyms */
+                               if (ext->ksym.type_id && ext->is_set) { /* typed ksyms */
                                        insn[0].src_reg = BPF_PSEUDO_BTF_ID;
                                        insn[0].imm = ext->ksym.kernel_btf_id;
                                        insn[1].imm = ext->ksym.kernel_btf_obj_fd;
-                               } else { /* typeless ksyms */
+                               } else { /* typeless ksyms or unresolved typed ksyms */
                                        insn[0].imm = (__u32)ext->ksym.addr;
                                        insn[1].imm = ext->ksym.addr >> 32;
                                }
@@ -6608,11 +6644,8 @@ static int find_ksym_btf_id(struct bpf_object *obj, const char *ksym_name,
                                break;
                }
        }
-       if (id <= 0) {
-               pr_warn("extern (%s ksym) '%s': failed to find BTF ID in kernel BTF(s).\n",
-                       __btf_kind_str(kind), ksym_name);
+       if (id <= 0)
                return -ESRCH;
-       }
 
        *res_btf = btf;
        *res_btf_fd = btf_fd;
@@ -6629,8 +6662,13 @@ static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj,
        struct btf *btf = NULL;
 
        id = find_ksym_btf_id(obj, ext->name, BTF_KIND_VAR, &btf, &btf_fd);
-       if (id < 0)
+       if (id == -ESRCH && ext->is_weak) {
+               return 0;
+       } else if (id < 0) {
+               pr_warn("extern (var ksym) '%s': not found in kernel BTF\n",
+                       ext->name);
                return id;
+       }
 
        /* find local type_id */
        local_type_id = ext->ksym.type_id;
@@ -8808,7 +8846,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 
 struct bpf_link {
        int (*detach)(struct bpf_link *link);
-       int (*destroy)(struct bpf_link *link);
+       void (*dealloc)(struct bpf_link *link);
        char *pin_path;         /* NULL, if not pinned */
        int fd;                 /* hook FD, -1 if not applicable */
        bool disconnected;
@@ -8847,11 +8885,12 @@ int bpf_link__destroy(struct bpf_link *link)
 
        if (!link->disconnected && link->detach)
                err = link->detach(link);
-       if (link->destroy)
-               link->destroy(link);
        if (link->pin_path)
                free(link->pin_path);
-       free(link);
+       if (link->dealloc)
+               link->dealloc(link);
+       else
+               free(link);
 
        return libbpf_err(err);
 }
@@ -8948,23 +8987,42 @@ int bpf_link__unpin(struct bpf_link *link)
        return 0;
 }
 
-static int bpf_link__detach_perf_event(struct bpf_link *link)
+struct bpf_link_perf {
+       struct bpf_link link;
+       int perf_event_fd;
+};
+
+static int bpf_link_perf_detach(struct bpf_link *link)
 {
-       int err;
+       struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link);
+       int err = 0;
 
-       err = ioctl(link->fd, PERF_EVENT_IOC_DISABLE, 0);
-       if (err)
+       if (ioctl(perf_link->perf_event_fd, PERF_EVENT_IOC_DISABLE, 0) < 0)
                err = -errno;
 
+       if (perf_link->perf_event_fd != link->fd)
+               close(perf_link->perf_event_fd);
        close(link->fd);
+
        return libbpf_err(err);
 }
 
-struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, int pfd)
+static void bpf_link_perf_dealloc(struct bpf_link *link)
+{
+       struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link);
+
+       free(perf_link);
+}
+
+struct bpf_link *bpf_program__attach_perf_event_opts(struct bpf_program *prog, int pfd,
+                                                    const struct bpf_perf_event_opts *opts)
 {
        char errmsg[STRERR_BUFSIZE];
-       struct bpf_link *link;
-       int prog_fd, err;
+       struct bpf_link_perf *link;
+       int prog_fd, link_fd = -1, err;
+
+       if (!OPTS_VALID(opts, bpf_perf_event_opts))
+               return libbpf_err_ptr(-EINVAL);
 
        if (pfd < 0) {
                pr_warn("prog '%s': invalid perf event FD %d\n",
@@ -8981,27 +9039,59 @@ struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, int pf
        link = calloc(1, sizeof(*link));
        if (!link)
                return libbpf_err_ptr(-ENOMEM);
-       link->detach = &bpf_link__detach_perf_event;
-       link->fd = pfd;
+       link->link.detach = &bpf_link_perf_detach;
+       link->link.dealloc = &bpf_link_perf_dealloc;
+       link->perf_event_fd = pfd;
 
-       if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0) {
-               err = -errno;
-               free(link);
-               pr_warn("prog '%s': failed to attach to pfd %d: %s\n",
-                       prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
-               if (err == -EPROTO)
-                       pr_warn("prog '%s': try add PERF_SAMPLE_CALLCHAIN to or remove exclude_callchain_[kernel|user] from pfd %d\n",
-                               prog->name, pfd);
-               return libbpf_err_ptr(err);
+       if (kernel_supports(prog->obj, FEAT_PERF_LINK)) {
+               DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_opts,
+                       .perf_event.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0));
+
+               link_fd = bpf_link_create(prog_fd, pfd, BPF_PERF_EVENT, &link_opts);
+               if (link_fd < 0) {
+                       err = -errno;
+                       pr_warn("prog '%s': failed to create BPF link for perf_event FD %d: %d (%s)\n",
+                               prog->name, pfd,
+                               err, libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
+                       goto err_out;
+               }
+               link->link.fd = link_fd;
+       } else {
+               if (OPTS_GET(opts, bpf_cookie, 0)) {
+                       pr_warn("prog '%s': user context value is not supported\n", prog->name);
+                       err = -EOPNOTSUPP;
+                       goto err_out;
+               }
+
+               if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0) {
+                       err = -errno;
+                       pr_warn("prog '%s': failed to attach to perf_event FD %d: %s\n",
+                               prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
+                       if (err == -EPROTO)
+                               pr_warn("prog '%s': try add PERF_SAMPLE_CALLCHAIN to or remove exclude_callchain_[kernel|user] from pfd %d\n",
+                                       prog->name, pfd);
+                       goto err_out;
+               }
+               link->link.fd = pfd;
        }
        if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
                err = -errno;
-               free(link);
-               pr_warn("prog '%s': failed to enable pfd %d: %s\n",
+               pr_warn("prog '%s': failed to enable perf_event FD %d: %s\n",
                        prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
-               return libbpf_err_ptr(err);
+               goto err_out;
        }
-       return link;
+
+       return &link->link;
+err_out:
+       if (link_fd >= 0)
+               close(link_fd);
+       free(link);
+       return libbpf_err_ptr(err);
+}
+
+struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, int pfd)
+{
+       return bpf_program__attach_perf_event_opts(prog, pfd, NULL);
 }
 
 /*
@@ -9062,13 +9152,19 @@ static int determine_uprobe_retprobe_bit(void)
        return parse_uint_from_file(file, "config:%d\n");
 }
 
+#define PERF_UPROBE_REF_CTR_OFFSET_BITS 32
+#define PERF_UPROBE_REF_CTR_OFFSET_SHIFT 32
+
 static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name,
-                                uint64_t offset, int pid)
+                                uint64_t offset, int pid, size_t ref_ctr_off)
 {
        struct perf_event_attr attr = {};
        char errmsg[STRERR_BUFSIZE];
        int type, pfd, err;
 
+       if (ref_ctr_off >= (1ULL << PERF_UPROBE_REF_CTR_OFFSET_BITS))
+               return -EINVAL;
+
        type = uprobe ? determine_uprobe_perf_type()
                      : determine_kprobe_perf_type();
        if (type < 0) {
@@ -9091,6 +9187,7 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name,
        }
        attr.size = sizeof(attr);
        attr.type = type;
+       attr.config |= (__u64)ref_ctr_off << PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
        attr.config1 = ptr_to_u64(name); /* kprobe_func or uprobe_path */
        attr.config2 = offset;           /* kprobe_addr or probe_offset */
 
@@ -9112,8 +9209,9 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name,
 struct bpf_link *
 bpf_program__attach_kprobe_opts(struct bpf_program *prog,
                                const char *func_name,
-                               struct bpf_kprobe_opts *opts)
+                               const struct bpf_kprobe_opts *opts)
 {
+       DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts);
        char errmsg[STRERR_BUFSIZE];
        struct bpf_link *link;
        unsigned long offset;
@@ -9125,16 +9223,17 @@ bpf_program__attach_kprobe_opts(struct bpf_program *prog,
 
        retprobe = OPTS_GET(opts, retprobe, false);
        offset = OPTS_GET(opts, offset, 0);
+       pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0);
 
        pfd = perf_event_open_probe(false /* uprobe */, retprobe, func_name,
-                                   offset, -1 /* pid */);
+                                   offset, -1 /* pid */, 0 /* ref_ctr_off */);
        if (pfd < 0) {
                pr_warn("prog '%s': failed to create %s '%s' perf event: %s\n",
                        prog->name, retprobe ? "kretprobe" : "kprobe", func_name,
                        libbpf_strerror_r(pfd, errmsg, sizeof(errmsg)));
                return libbpf_err_ptr(pfd);
        }
-       link = bpf_program__attach_perf_event(prog, pfd);
+       link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts);
        err = libbpf_get_error(link);
        if (err) {
                close(pfd);
@@ -9189,17 +9288,27 @@ static struct bpf_link *attach_kprobe(const struct bpf_sec_def *sec,
        return link;
 }
 
-struct bpf_link *bpf_program__attach_uprobe(struct bpf_program *prog,
-                                           bool retprobe, pid_t pid,
-                                           const char *binary_path,
-                                           size_t func_offset)
+LIBBPF_API struct bpf_link *
+bpf_program__attach_uprobe_opts(struct bpf_program *prog, pid_t pid,
+                               const char *binary_path, size_t func_offset,
+                               const struct bpf_uprobe_opts *opts)
 {
+       DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts);
        char errmsg[STRERR_BUFSIZE];
        struct bpf_link *link;
+       size_t ref_ctr_off;
        int pfd, err;
+       bool retprobe;
+
+       if (!OPTS_VALID(opts, bpf_uprobe_opts))
+               return libbpf_err_ptr(-EINVAL);
 
-       pfd = perf_event_open_probe(true /* uprobe */, retprobe,
-                                   binary_path, func_offset, pid);
+       retprobe = OPTS_GET(opts, retprobe, false);
+       ref_ctr_off = OPTS_GET(opts, ref_ctr_offset, 0);
+       pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0);
+
+       pfd = perf_event_open_probe(true /* uprobe */, retprobe, binary_path,
+                                   func_offset, pid, ref_ctr_off);
        if (pfd < 0) {
                pr_warn("prog '%s': failed to create %s '%s:0x%zx' perf event: %s\n",
                        prog->name, retprobe ? "uretprobe" : "uprobe",
@@ -9207,7 +9316,7 @@ struct bpf_link *bpf_program__attach_uprobe(struct bpf_program *prog,
                        libbpf_strerror_r(pfd, errmsg, sizeof(errmsg)));
                return libbpf_err_ptr(pfd);
        }
-       link = bpf_program__attach_perf_event(prog, pfd);
+       link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts);
        err = libbpf_get_error(link);
        if (err) {
                close(pfd);
@@ -9220,6 +9329,16 @@ struct bpf_link *bpf_program__attach_uprobe(struct bpf_program *prog,
        return link;
 }
 
+struct bpf_link *bpf_program__attach_uprobe(struct bpf_program *prog,
+                                           bool retprobe, pid_t pid,
+                                           const char *binary_path,
+                                           size_t func_offset)
+{
+       DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts, .retprobe = retprobe);
+
+       return bpf_program__attach_uprobe_opts(prog, pid, binary_path, func_offset, &opts);
+}
+
 static int determine_tracepoint_id(const char *tp_category,
                                   const char *tp_name)
 {
@@ -9270,14 +9389,21 @@ static int perf_event_open_tracepoint(const char *tp_category,
        return pfd;
 }
 
-struct bpf_link *bpf_program__attach_tracepoint(struct bpf_program *prog,
-                                               const char *tp_category,
-                                               const char *tp_name)
+struct bpf_link *bpf_program__attach_tracepoint_opts(struct bpf_program *prog,
+                                                    const char *tp_category,
+                                                    const char *tp_name,
+                                                    const struct bpf_tracepoint_opts *opts)
 {
+       DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts);
        char errmsg[STRERR_BUFSIZE];
        struct bpf_link *link;
        int pfd, err;
 
+       if (!OPTS_VALID(opts, bpf_tracepoint_opts))
+               return libbpf_err_ptr(-EINVAL);
+
+       pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0);
+
        pfd = perf_event_open_tracepoint(tp_category, tp_name);
        if (pfd < 0) {
                pr_warn("prog '%s': failed to create tracepoint '%s/%s' perf event: %s\n",
@@ -9285,7 +9411,7 @@ struct bpf_link *bpf_program__attach_tracepoint(struct bpf_program *prog,
                        libbpf_strerror_r(pfd, errmsg, sizeof(errmsg)));
                return libbpf_err_ptr(pfd);
        }
-       link = bpf_program__attach_perf_event(prog, pfd);
+       link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts);
        err = libbpf_get_error(link);
        if (err) {
                close(pfd);
@@ -9297,6 +9423,13 @@ struct bpf_link *bpf_program__attach_tracepoint(struct bpf_program *prog,
        return link;
 }
 
+struct bpf_link *bpf_program__attach_tracepoint(struct bpf_program *prog,
+                                               const char *tp_category,
+                                               const char *tp_name)
+{
+       return bpf_program__attach_tracepoint_opts(prog, tp_category, tp_name, NULL);
+}
+
 static struct bpf_link *attach_tp(const struct bpf_sec_def *sec,
                                  struct bpf_program *prog)
 {
index 1271d99..f177d89 100644
@@ -104,17 +104,6 @@ struct bpf_object_open_opts {
 };
 #define bpf_object_open_opts__last_field btf_custom_path
 
-struct bpf_kprobe_opts {
-       /* size of this struct, for forward/backward compatiblity */
-       size_t sz;
-       /* function's offset to install kprobe to */
-       unsigned long offset;
-       /* kprobe is return probe */
-       bool retprobe;
-       size_t :0;
-};
-#define bpf_kprobe_opts__last_field retprobe
-
 LIBBPF_API struct bpf_object *bpf_object__open(const char *path);
 LIBBPF_API struct bpf_object *
 bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts);
@@ -255,24 +244,86 @@ LIBBPF_API int bpf_link__destroy(struct bpf_link *link);
 
 LIBBPF_API struct bpf_link *
 bpf_program__attach(struct bpf_program *prog);
+
+struct bpf_perf_event_opts {
+       /* size of this struct, for forward/backward compatibility */
+       size_t sz;
+       /* custom user-provided value fetchable through bpf_get_attach_cookie() */
+       __u64 bpf_cookie;
+};
+#define bpf_perf_event_opts__last_field bpf_cookie
+
 LIBBPF_API struct bpf_link *
 bpf_program__attach_perf_event(struct bpf_program *prog, int pfd);
+
+LIBBPF_API struct bpf_link *
+bpf_program__attach_perf_event_opts(struct bpf_program *prog, int pfd,
+                                   const struct bpf_perf_event_opts *opts);
+
+struct bpf_kprobe_opts {
+       /* size of this struct, for forward/backward compatibility */
+       size_t sz;
+       /* custom user-provided value fetchable through bpf_get_attach_cookie() */
+       __u64 bpf_cookie;
+       /* function's offset to install kprobe to */
+       unsigned long offset;
+       /* kprobe is return probe */
+       bool retprobe;
+       size_t :0;
+};
+#define bpf_kprobe_opts__last_field retprobe
+
 LIBBPF_API struct bpf_link *
 bpf_program__attach_kprobe(struct bpf_program *prog, bool retprobe,
                           const char *func_name);
 LIBBPF_API struct bpf_link *
 bpf_program__attach_kprobe_opts(struct bpf_program *prog,
                                 const char *func_name,
-                                struct bpf_kprobe_opts *opts);
+                                const struct bpf_kprobe_opts *opts);
+
+struct bpf_uprobe_opts {
+       /* size of this struct, for forward/backward compatibility */
+       size_t sz;
+       /* offset of kernel reference counted USDT semaphore, added in
+        * a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe")
+        */
+       size_t ref_ctr_offset;
+       /* custom user-provided value fetchable through bpf_get_attach_cookie() */
+       __u64 bpf_cookie;
+       /* uprobe is return probe, invoked at function return time */
+       bool retprobe;
+       size_t :0;
+};
+#define bpf_uprobe_opts__last_field retprobe
+
 LIBBPF_API struct bpf_link *
 bpf_program__attach_uprobe(struct bpf_program *prog, bool retprobe,
                           pid_t pid, const char *binary_path,
                           size_t func_offset);
 LIBBPF_API struct bpf_link *
+bpf_program__attach_uprobe_opts(struct bpf_program *prog, pid_t pid,
+                               const char *binary_path, size_t func_offset,
+                               const struct bpf_uprobe_opts *opts);
+
+struct bpf_tracepoint_opts {
+       /* size of this struct, for forward/backward compatibility */
+       size_t sz;
+       /* custom user-provided value fetchable through bpf_get_attach_cookie() */
+       __u64 bpf_cookie;
+};
+#define bpf_tracepoint_opts__last_field bpf_cookie
+
+LIBBPF_API struct bpf_link *
 bpf_program__attach_tracepoint(struct bpf_program *prog,
                               const char *tp_category,
                               const char *tp_name);
 LIBBPF_API struct bpf_link *
+bpf_program__attach_tracepoint_opts(struct bpf_program *prog,
+                                   const char *tp_category,
+                                   const char *tp_name,
+                                   const struct bpf_tracepoint_opts *opts);
+
+LIBBPF_API struct bpf_link *
 bpf_program__attach_raw_tracepoint(struct bpf_program *prog,
                                   const char *tp_name);
 LIBBPF_API struct bpf_link *
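
A usage sketch of the new opts variants; prog, func_off, sema_off and the binary path are placeholders:

    DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts,
            .retprobe = false,
            .ref_ctr_offset = sema_off,  /* USDT semaphore offset, or 0 */
            .bpf_cookie = 0xcafe);
    struct bpf_link *link;
    long err;

    link = bpf_program__attach_uprobe_opts(prog, -1 /* any pid */,
                                           "/usr/lib/libc.so.6",
                                           func_off, &opts);
    err = libbpf_get_error(link);
    if (err)
            return err;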
index 58e0fb2..bbc53bb 100644
@@ -374,6 +374,9 @@ LIBBPF_0.5.0 {
                bpf_map__pin_path;
                bpf_map_lookup_and_delete_elem_flags;
                bpf_program__attach_kprobe_opts;
+               bpf_program__attach_perf_event_opts;
+               bpf_program__attach_tracepoint_opts;
+               bpf_program__attach_uprobe_opts;
                bpf_object__gen_loader;
                btf__load_from_kernel_by_id;
                btf__load_from_kernel_by_id_split;
index f7b691d..533b021 100644
@@ -196,6 +196,17 @@ void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz,
                     size_t cur_cnt, size_t max_cnt, size_t add_cnt);
 int libbpf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt);
 
+static inline bool libbpf_is_mem_zeroed(const char *p, ssize_t len)
+{
+       while (len > 0) {
+               if (*p)
+                       return false;
+               p++;
+               len--;
+       }
+       return true;
+}
+
 static inline bool libbpf_validate_opts(const char *opts,
                                        size_t opts_sz, size_t user_sz,
                                        const char *type_name)
@@ -204,16 +215,9 @@ static inline bool libbpf_validate_opts(const char *opts,
                pr_warn("%s size (%zu) is too small\n", type_name, user_sz);
                return false;
        }
-       if (user_sz > opts_sz) {
-               size_t i;
-
-               for (i = opts_sz; i < user_sz; i++) {
-                       if (opts[i]) {
-                               pr_warn("%s has non-zero extra bytes\n",
-                                       type_name);
-                               return false;
-                       }
-               }
+       if (!libbpf_is_mem_zeroed(opts + opts_sz, (ssize_t)user_sz - opts_sz)) {
+               pr_warn("%s has non-zero extra bytes\n", type_name);
+               return false;
        }
        return true;
 }
@@ -233,6 +237,14 @@ static inline bool libbpf_validate_opts(const char *opts,
                        (opts)->field = value;  \
        } while (0)
 
+#define OPTS_ZEROED(opts, last_nonzero_field)                                \
+({                                                                           \
+       ssize_t __off = offsetofend(typeof(*(opts)), last_nonzero_field);     \
+       !(opts) || libbpf_is_mem_zeroed((const void *)opts + __off,           \
+                                       (opts)->sz - __off);                  \
+})
+
+
 int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz);
 int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz);
 int libbpf__load_raw_btf(const char *raw_types, size_t types_len,
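
Illustrating the contract OPTS_ZEROED() enforces (values made up): every byte after the named field must be zero, which is how bpf_link_create() can reject union members that the requested attach type does not consume:

    DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts);

    opts.perf_event.bpf_cookie = 1;
    /* false: bytes past target_btf_id (the union) are now non-zero */
    if (!OPTS_ZEROED(&opts, target_btf_id))
            return libbpf_err(-EINVAL);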
index 2a58b7b..866531c 100644
@@ -79,7 +79,7 @@ TEST_PROGS := test_kmod.sh \
 
 TEST_PROGS_EXTENDED := with_addr.sh \
        with_tunnels.sh \
-       test_xdp_vlan.sh
+       test_xdp_vlan.sh test_bpftool.py
 
 # Compile but not part of 'make run_tests'
 TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
@@ -187,6 +187,8 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL)
                    BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) &&      \
                    cp $(SCRATCH_DIR)/runqslower $@
 
+TEST_GEN_PROGS_EXTENDED += $(DEFAULT_BPFTOOL)
+
 $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ)
 
 $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c
index 029589c..b1ede6f 100644
 SEC("struct_ops/"#name) \
 BPF_PROG(name, args)
 
+#ifndef SOL_TCP
+#define SOL_TCP 6
+#endif
+
 #define tcp_jiffies32 ((__u32)bpf_jiffies64())
 
 struct sock_common {
@@ -27,6 +31,7 @@ enum sk_pacing {
 
 struct sock {
        struct sock_common      __sk_common;
+#define sk_state               __sk_common.skc_state
        unsigned long           sk_pacing_rate;
        __u32                   sk_pacing_status; /* see enum sk_pacing */
 } __attribute__((preserve_access_index));
@@ -203,6 +208,20 @@ static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk)
        return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited);
 }
 
+static __always_inline bool tcp_cc_eq(const char *a, const char *b)
+{
+       int i;
+
+       for (i = 0; i < TCP_CA_NAME_MAX; i++) {
+               if (a[i] != b[i])
+                       return false;
+               if (!a[i])
+                       break;
+       }
+
+       return true;
+}
+
 extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym;
 extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym;
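
These helpers back the new ability of a bpf-tcp-cc program to call bpf_setsockopt() from its own .init, e.g. to fall back to another congestion control; a hypothetical fragment (program name and fallback trigger are made up; TCP_CONGESTION is 13 in linux/tcp.h):

    #ifndef TCP_CONGESTION
    #define TCP_CONGESTION 13
    #endif

    const volatile bool use_fallback;  /* set by user space before load */

    SEC("struct_ops/mycc_init")
    void BPF_PROG(mycc_init, struct sock *sk)
    {
            char cc[TCP_CA_NAME_MAX] = "cubic";

            if (use_fallback)
                    bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
                                   cc, sizeof(cc));
    }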
 
index d685768..7e9f637 100644
@@ -218,13 +218,18 @@ static int connect_fd_to_addr(int fd,
        return 0;
 }
 
-int connect_to_fd(int server_fd, int timeout_ms)
+static const struct network_helper_opts default_opts;
+
+int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts)
 {
        struct sockaddr_storage addr;
        struct sockaddr_in *addr_in;
        socklen_t addrlen, optlen;
        int fd, type;
 
+       if (!opts)
+               opts = &default_opts;
+
        optlen = sizeof(type);
        if (getsockopt(server_fd, SOL_SOCKET, SO_TYPE, &type, &optlen)) {
                log_err("getsockopt(SOL_TYPE)");
@@ -244,7 +249,12 @@ int connect_to_fd(int server_fd, int timeout_ms)
                return -1;
        }
 
-       if (settimeo(fd, timeout_ms))
+       if (settimeo(fd, opts->timeout_ms))
+               goto error_close;
+
+       if (opts->cc && opts->cc[0] &&
+           setsockopt(fd, SOL_TCP, TCP_CONGESTION, opts->cc,
+                      strlen(opts->cc) + 1))
                goto error_close;
 
        if (connect_fd_to_addr(fd, &addr, addrlen))
@@ -257,6 +267,15 @@ error_close:
        return -1;
 }
 
+int connect_to_fd(int server_fd, int timeout_ms)
+{
+       struct network_helper_opts opts = {
+               .timeout_ms = timeout_ms,
+       };
+
+       return connect_to_fd_opts(server_fd, &opts);
+}
+
 int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms)
 {
        struct sockaddr_storage addr;
index c59a8f6..da7e132 100644
@@ -17,6 +17,11 @@ typedef __u16 __sum16;
 #define VIP_NUM 5
 #define MAGIC_BYTES 123
 
+struct network_helper_opts {
+       const char *cc;
+       int timeout_ms;
+};
+
 /* ipv4 test vector */
 struct ipv4_packet {
        struct ethhdr eth;
@@ -41,6 +46,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str,
                            unsigned int nr_listens);
 void free_fds(int *fds, unsigned int nr_close_fds);
 int connect_to_fd(int server_fd, int timeout_ms);
+int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts);
 int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms);
 int fastopen_connect(int server_fd, const char *data, unsigned int data_len,
                     int timeout_ms);
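
Typical test usage of the new helper, with server_fd assumed to be an existing listening socket:

    struct network_helper_opts opts = {
            .timeout_ms = 1000,
            .cc = "dctcp",  /* applied via setsockopt(TCP_CONGESTION) */
    };
    int fd = connect_to_fd_opts(server_fd, &opts);

    if (fd < 0)
            return -errno;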
index ec11e20..bf307bb 100644
@@ -2,79 +2,28 @@
 #include <test_progs.h>
 #include "test_attach_probe.skel.h"
 
-#if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2
-
-#define OP_RT_RA_MASK   0xffff0000UL
-#define LIS_R2          0x3c400000UL
-#define ADDIS_R2_R12    0x3c4c0000UL
-#define ADDI_R2_R2      0x38420000UL
-
-static ssize_t get_offset(ssize_t addr, ssize_t base)
-{
-       u32 *insn = (u32 *) addr;
-
-       /*
-        * A PPC64 ABIv2 function may have a local and a global entry
-        * point. We need to use the local entry point when patching
-        * functions, so identify and step over the global entry point
-        * sequence.
-        *
-        * The global entry point sequence is always of the form:
-        *
-        * addis r2,r12,XXXX
-        * addi  r2,r2,XXXX
-        *
-        * A linker optimisation may convert the addis to lis:
-        *
-        * lis   r2,XXXX
-        * addi  r2,r2,XXXX
-        */
-       if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
-            ((*insn & OP_RT_RA_MASK) == LIS_R2)) &&
-           ((*(insn + 1) & OP_RT_RA_MASK) == ADDI_R2_R2))
-               return (ssize_t)(insn + 2) - base;
-       else
-               return addr - base;
-}
-#else
-#define get_offset(addr, base) (addr - base)
-#endif
-
-ssize_t get_base_addr() {
-       size_t start, offset;
-       char buf[256];
-       FILE *f;
-
-       f = fopen("/proc/self/maps", "r");
-       if (!f)
-               return -errno;
-
-       while (fscanf(f, "%zx-%*x %s %zx %*[^\n]\n",
-                     &start, buf, &offset) == 3) {
-               if (strcmp(buf, "r-xp") == 0) {
-                       fclose(f);
-                       return start - offset;
-               }
-       }
-
-       fclose(f);
-       return -EINVAL;
-}
+/* this is how a USDT semaphore is actually defined, except for the volatile modifier */
+volatile unsigned short uprobe_ref_ctr __attribute__((unused)) __attribute((section(".probes")));
 
 void test_attach_probe(void)
 {
+       DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts);
        int duration = 0;
        struct bpf_link *kprobe_link, *kretprobe_link;
        struct bpf_link *uprobe_link, *uretprobe_link;
        struct test_attach_probe* skel;
        size_t uprobe_offset;
-       ssize_t base_addr;
+       ssize_t base_addr, ref_ctr_offset;
 
        base_addr = get_base_addr();
        if (CHECK(base_addr < 0, "get_base_addr",
                  "failed to find base addr: %zd", base_addr))
                return;
-       uprobe_offset = get_offset((size_t)&get_base_addr, base_addr);
+       uprobe_offset = get_uprobe_offset(&get_base_addr, base_addr);
+
+       ref_ctr_offset = get_rel_offset((uintptr_t)&uprobe_ref_ctr);
+       if (!ASSERT_GE(ref_ctr_offset, 0, "ref_ctr_offset"))
+               return;
 
        skel = test_attach_probe__open_and_load();
        if (CHECK(!skel, "skel_open", "failed to open skeleton\n"))
@@ -96,20 +45,28 @@ void test_attach_probe(void)
                goto cleanup;
        skel->links.handle_kretprobe = kretprobe_link;
 
-       uprobe_link = bpf_program__attach_uprobe(skel->progs.handle_uprobe,
-                                                false /* retprobe */,
-                                                0 /* self pid */,
-                                                "/proc/self/exe",
-                                                uprobe_offset);
+       ASSERT_EQ(uprobe_ref_ctr, 0, "uprobe_ref_ctr_before");
+
+       uprobe_opts.retprobe = false;
+       uprobe_opts.ref_ctr_offset = ref_ctr_offset;
+       uprobe_link = bpf_program__attach_uprobe_opts(skel->progs.handle_uprobe,
+                                                     0 /* self pid */,
+                                                     "/proc/self/exe",
+                                                     uprobe_offset,
+                                                     &uprobe_opts);
        if (!ASSERT_OK_PTR(uprobe_link, "attach_uprobe"))
                goto cleanup;
        skel->links.handle_uprobe = uprobe_link;
 
-       uretprobe_link = bpf_program__attach_uprobe(skel->progs.handle_uretprobe,
-                                                   true /* retprobe */,
-                                                   -1 /* any pid */,
-                                                   "/proc/self/exe",
-                                                   uprobe_offset);
+       ASSERT_GT(uprobe_ref_ctr, 0, "uprobe_ref_ctr_after");
+
+       /* if uprobe uses ref_ctr, uretprobe has to use ref_ctr as well */
+       uprobe_opts.retprobe = true;
+       uprobe_opts.ref_ctr_offset = ref_ctr_offset;
+       uretprobe_link = bpf_program__attach_uprobe_opts(skel->progs.handle_uretprobe,
+                                                        -1 /* any pid */,
+                                                        "/proc/self/exe",
+                                                        uprobe_offset, &uprobe_opts);
        if (!ASSERT_OK_PTR(uretprobe_link, "attach_uretprobe"))
                goto cleanup;
        skel->links.handle_uretprobe = uretprobe_link;
@@ -136,4 +93,5 @@ void test_attach_probe(void)
 
 cleanup:
        test_attach_probe__destroy(skel);
+       ASSERT_EQ(uprobe_ref_ctr, 0, "uprobe_ref_ctr_cleanup");
 }
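
The ref_ctr assertions work because the kernel increments the semaphore at ref_ctr_offset for as long as the uprobe stays attached. That is also what lets an instrumented application gate expensive USDT argument setup; roughly (the helper name is made up):

    if (uprobe_ref_ctr > 0)         /* some tracer is attached */
            collect_expensive_usdt_args();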
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
new file mode 100644
index 0000000..5eea3c3
--- /dev/null
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <sched.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <test_progs.h>
+#include "test_bpf_cookie.skel.h"
+
+static void kprobe_subtest(struct test_bpf_cookie *skel)
+{
+       DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts);
+       struct bpf_link *link1 = NULL, *link2 = NULL;
+       struct bpf_link *retlink1 = NULL, *retlink2 = NULL;
+
+       /* attach two kprobes */
+       opts.bpf_cookie = 0x1;
+       opts.retprobe = false;
+       link1 = bpf_program__attach_kprobe_opts(skel->progs.handle_kprobe,
+                                                SYS_NANOSLEEP_KPROBE_NAME, &opts);
+       if (!ASSERT_OK_PTR(link1, "link1"))
+               goto cleanup;
+
+       opts.bpf_cookie = 0x2;
+       opts.retprobe = false;
+       link2 = bpf_program__attach_kprobe_opts(skel->progs.handle_kprobe,
+                                                SYS_NANOSLEEP_KPROBE_NAME, &opts);
+       if (!ASSERT_OK_PTR(link2, "link2"))
+               goto cleanup;
+
+       /* attach two kretprobes */
+       opts.bpf_cookie = 0x10;
+       opts.retprobe = true;
+       retlink1 = bpf_program__attach_kprobe_opts(skel->progs.handle_kretprobe,
+                                                   SYS_NANOSLEEP_KPROBE_NAME, &opts);
+       if (!ASSERT_OK_PTR(retlink1, "retlink1"))
+               goto cleanup;
+
+       opts.bpf_cookie = 0x20;
+       opts.retprobe = true;
+       retlink2 = bpf_program__attach_kprobe_opts(skel->progs.handle_kretprobe,
+                                                   SYS_NANOSLEEP_KPROBE_NAME, &opts);
+       if (!ASSERT_OK_PTR(retlink2, "retlink2"))
+               goto cleanup;
+
+       /* trigger kprobe && kretprobe */
+       usleep(1);
+
+       ASSERT_EQ(skel->bss->kprobe_res, 0x1 | 0x2, "kprobe_res");
+       ASSERT_EQ(skel->bss->kretprobe_res, 0x10 | 0x20, "kretprobe_res");
+
+cleanup:
+       bpf_link__destroy(link1);
+       bpf_link__destroy(link2);
+       bpf_link__destroy(retlink1);
+       bpf_link__destroy(retlink2);
+}
+
+static void uprobe_subtest(struct test_bpf_cookie *skel)
+{
+       DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts);
+       struct bpf_link *link1 = NULL, *link2 = NULL;
+       struct bpf_link *retlink1 = NULL, *retlink2 = NULL;
+       size_t uprobe_offset;
+       ssize_t base_addr;
+
+       base_addr = get_base_addr();
+       uprobe_offset = get_uprobe_offset(&get_base_addr, base_addr);
+
+       /* attach two uprobes */
+       opts.bpf_cookie = 0x100;
+       opts.retprobe = false;
+       link1 = bpf_program__attach_uprobe_opts(skel->progs.handle_uprobe, 0 /* self pid */,
+                                               "/proc/self/exe", uprobe_offset, &opts);
+       if (!ASSERT_OK_PTR(link1, "link1"))
+               goto cleanup;
+
+       opts.bpf_cookie = 0x200;
+       opts.retprobe = false;
+       link2 = bpf_program__attach_uprobe_opts(skel->progs.handle_uprobe, -1 /* any pid */,
+                                               "/proc/self/exe", uprobe_offset, &opts);
+       if (!ASSERT_OK_PTR(link2, "link2"))
+               goto cleanup;
+
+       /* attach two uretprobes */
+       opts.bpf_cookie = 0x1000;
+       opts.retprobe = true;
+       retlink1 = bpf_program__attach_uprobe_opts(skel->progs.handle_uretprobe, -1 /* any pid */,
+                                                  "/proc/self/exe", uprobe_offset, &opts);
+       if (!ASSERT_OK_PTR(retlink1, "retlink1"))
+               goto cleanup;
+
+       opts.bpf_cookie = 0x2000;
+       opts.retprobe = true;
+       retlink2 = bpf_program__attach_uprobe_opts(skel->progs.handle_uretprobe, 0 /* self pid */,
+                                                  "/proc/self/exe", uprobe_offset, &opts);
+       if (!ASSERT_OK_PTR(retlink2, "retlink2"))
+               goto cleanup;
+
+       /* trigger uprobe && uretprobe */
+       get_base_addr();
+
+       ASSERT_EQ(skel->bss->uprobe_res, 0x100 | 0x200, "uprobe_res");
+       ASSERT_EQ(skel->bss->uretprobe_res, 0x1000 | 0x2000, "uretprobe_res");
+
+cleanup:
+       bpf_link__destroy(link1);
+       bpf_link__destroy(link2);
+       bpf_link__destroy(retlink1);
+       bpf_link__destroy(retlink2);
+}
+
+static void tp_subtest(struct test_bpf_cookie *skel)
+{
+       DECLARE_LIBBPF_OPTS(bpf_tracepoint_opts, opts);
+       struct bpf_link *link1 = NULL, *link2 = NULL, *link3 = NULL;
+
+       /* attach first tp prog */
+       opts.bpf_cookie = 0x10000;
+       link1 = bpf_program__attach_tracepoint_opts(skel->progs.handle_tp1,
+                                                   "syscalls", "sys_enter_nanosleep", &opts);
+       if (!ASSERT_OK_PTR(link1, "link1"))
+               goto cleanup;
+
+       /* attach second tp prog */
+       opts.bpf_cookie = 0x20000;
+       link2 = bpf_program__attach_tracepoint_opts(skel->progs.handle_tp2,
+                                                   "syscalls", "sys_enter_nanosleep", &opts);
+       if (!ASSERT_OK_PTR(link2, "link2"))
+               goto cleanup;
+
+       /* trigger tracepoints */
+       usleep(1);
+
+       ASSERT_EQ(skel->bss->tp_res, 0x10000 | 0x20000, "tp_res1");
+
+       /* now we detach the first prog and attach a third one, which causes
+        * two internal calls to bpf_prog_array_copy(), shuffling
+        * bpf_prog_array_item entries around. We test here that we don't
+        * lose track of the associated bpf_cookies.
+        */
+       bpf_link__destroy(link1);
+       link1 = NULL;
+       kern_sync_rcu();
+       skel->bss->tp_res = 0;
+
+       /* attach third tp prog */
+       opts.bpf_cookie = 0x40000;
+       link3 = bpf_program__attach_tracepoint_opts(skel->progs.handle_tp3,
+                                                   "syscalls", "sys_enter_nanosleep", &opts);
+       if (!ASSERT_OK_PTR(link3, "link3"))
+               goto cleanup;
+
+       /* trigger tracepoints */
+       usleep(1);
+
+       ASSERT_EQ(skel->bss->tp_res, 0x20000 | 0x40000, "tp_res2");
+
+cleanup:
+       bpf_link__destroy(link1);
+       bpf_link__destroy(link2);
+       bpf_link__destroy(link3);
+}
+
+static void burn_cpu(void)
+{
+       volatile int j = 0;
+       cpu_set_t cpu_set;
+       int i, err;
+
+       /* generate some branches on cpu 0 */
+       CPU_ZERO(&cpu_set);
+       CPU_SET(0, &cpu_set);
+       err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
+       ASSERT_OK(err, "set_thread_affinity");
+
+       /* spin the loop for a while (random high number) */
+       for (i = 0; i < 1000000; ++i)
+               ++j;
+}
+
+static void pe_subtest(struct test_bpf_cookie *skel)
+{
+       DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, opts);
+       struct bpf_link *link = NULL;
+       struct perf_event_attr attr;
+       int pfd = -1;
+
+       /* create perf event */
+       memset(&attr, 0, sizeof(attr));
+       attr.size = sizeof(attr);
+       attr.type = PERF_TYPE_SOFTWARE;
+       attr.config = PERF_COUNT_SW_CPU_CLOCK;
+       attr.freq = 1;
+       attr.sample_freq = 4000;
+       pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
+       if (!ASSERT_GE(pfd, 0, "perf_fd"))
+               goto cleanup;
+
+       opts.bpf_cookie = 0x100000;
+       link = bpf_program__attach_perf_event_opts(skel->progs.handle_pe, pfd, &opts);
+       if (!ASSERT_OK_PTR(link, "link1"))
+               goto cleanup;
+
+       burn_cpu(); /* trigger BPF prog */
+
+       ASSERT_EQ(skel->bss->pe_res, 0x100000, "pe_res1");
+
+       /* prevent bpf_link__destroy() from closing pfd itself */
+       bpf_link__disconnect(link);
+       /* close BPF link's FD explicitly */
+       close(bpf_link__fd(link));
+       /* free up memory used by struct bpf_link */
+       bpf_link__destroy(link);
+       link = NULL;
+       kern_sync_rcu();
+       skel->bss->pe_res = 0;
+
+       opts.bpf_cookie = 0x200000;
+       link = bpf_program__attach_perf_event_opts(skel->progs.handle_pe, pfd, &opts);
+       if (!ASSERT_OK_PTR(link, "link2"))
+               goto cleanup;
+
+       burn_cpu(); /* trigger BPF prog */
+
+       ASSERT_EQ(skel->bss->pe_res, 0x200000, "pe_res2");
+
+cleanup:
+       close(pfd);
+       bpf_link__destroy(link);
+}
+
+void test_bpf_cookie(void)
+{
+       struct test_bpf_cookie *skel;
+
+       skel = test_bpf_cookie__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "skel_open"))
+               return;
+
+       skel->bss->my_tid = syscall(SYS_gettid);
+
+       if (test__start_subtest("kprobe"))
+               kprobe_subtest(skel);
+       if (test__start_subtest("uprobe"))
+               uprobe_subtest(skel);
+       if (test__start_subtest("tracepoint"))
+               tp_subtest(skel);
+       if (test__start_subtest("perf_event"))
+               pe_subtest(skel);
+
+       test_bpf_cookie__destroy(skel);
+}
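
For reference, a minimal sketch of the cookie round trip outside the harness, assuming the opts-based attach API exercised above (skeleton and option values hypothetical):

        DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts,
                .bpf_cookie = 0xdeadbeef,       /* opaque u64 stored with the link */
        );
        struct bpf_link *link;

        link = bpf_program__attach_kprobe_opts(skel->progs.handle_kprobe,
                                               SYS_NANOSLEEP_KPROBE_NAME, &opts);
        /* in the program, bpf_get_attach_cookie(ctx) now returns 0xdeadbeef */
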
index 1f1aade..77ac24b 100644 (file)
@@ -13,6 +13,7 @@
 #include "bpf_iter_tcp6.skel.h"
 #include "bpf_iter_udp4.skel.h"
 #include "bpf_iter_udp6.skel.h"
+#include "bpf_iter_unix.skel.h"
 #include "bpf_iter_test_kern1.skel.h"
 #include "bpf_iter_test_kern2.skel.h"
 #include "bpf_iter_test_kern3.skel.h"
@@ -313,6 +314,19 @@ static void test_udp6(void)
        bpf_iter_udp6__destroy(skel);
 }
 
+static void test_unix(void)
+{
+       struct bpf_iter_unix *skel;
+
+       skel = bpf_iter_unix__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "bpf_iter_unix__open_and_load"))
+               return;
+
+       do_dummy_read(skel->progs.dump_unix);
+
+       bpf_iter_unix__destroy(skel);
+}
+
 /* The expected string is less than 16 bytes */
 static int do_read_with_fd(int iter_fd, const char *expected,
                           bool read_one_char)
@@ -1255,6 +1269,8 @@ void test_bpf_iter(void)
                test_udp4();
        if (test__start_subtest("udp6"))
                test_udp6();
+       if (test__start_subtest("unix"))
+               test_unix();
        if (test__start_subtest("anon"))
                test_anon_iter(false);
        if (test__start_subtest("anon-read-one-char"))
index efe1e97..94e03df 100644 (file)
@@ -4,37 +4,22 @@
 #include <linux/err.h>
 #include <netinet/tcp.h>
 #include <test_progs.h>
+#include "network_helpers.h"
 #include "bpf_dctcp.skel.h"
 #include "bpf_cubic.skel.h"
 #include "bpf_tcp_nogpl.skel.h"
+#include "bpf_dctcp_release.skel.h"
 
 #define min(a, b) ((a) < (b) ? (a) : (b))
 
+#ifndef ENOTSUPP
+#define ENOTSUPP 524
+#endif
+
 static const unsigned int total_bytes = 10 * 1024 * 1024;
-static const struct timeval timeo_sec = { .tv_sec = 10 };
-static const size_t timeo_optlen = sizeof(timeo_sec);
 static int expected_stg = 0xeB9F;
 static int stop, duration;
 
-static int settimeo(int fd)
-{
-       int err;
-
-       err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec,
-                        timeo_optlen);
-       if (CHECK(err == -1, "setsockopt(fd, SO_RCVTIMEO)", "errno:%d\n",
-                 errno))
-               return -1;
-
-       err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec,
-                        timeo_optlen);
-       if (CHECK(err == -1, "setsockopt(fd, SO_SNDTIMEO)", "errno:%d\n",
-                 errno))
-               return -1;
-
-       return 0;
-}
-
 static int settcpca(int fd, const char *tcp_ca)
 {
        int err;
@@ -61,7 +46,7 @@ static void *server(void *arg)
                goto done;
        }
 
-       if (settimeo(fd)) {
+       if (settimeo(fd, 0)) {
                err = -errno;
                goto done;
        }
@@ -114,7 +99,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map)
        }
 
        if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca) ||
-           settimeo(lfd) || settimeo(fd))
+           settimeo(lfd, 0) || settimeo(fd, 0))
                goto done;
 
        /* bind, listen and start server thread to accept */
@@ -267,6 +252,77 @@ static void test_invalid_license(void)
        libbpf_set_print(old_print_fn);
 }
 
+static void test_dctcp_fallback(void)
+{
+       int err, lfd = -1, cli_fd = -1, srv_fd = -1;
+       struct network_helper_opts opts = {
+               .cc = "cubic",
+       };
+       struct bpf_dctcp *dctcp_skel;
+       struct bpf_link *link = NULL;
+       char srv_cc[16];
+       socklen_t cc_len = sizeof(srv_cc);
+
+       dctcp_skel = bpf_dctcp__open();
+       if (!ASSERT_OK_PTR(dctcp_skel, "dctcp_skel"))
+               return;
+       strcpy(dctcp_skel->rodata->fallback, "cubic");
+       if (!ASSERT_OK(bpf_dctcp__load(dctcp_skel), "bpf_dctcp__load"))
+               goto done;
+
+       link = bpf_map__attach_struct_ops(dctcp_skel->maps.dctcp);
+       if (!ASSERT_OK_PTR(link, "dctcp link"))
+               goto done;
+
+       lfd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
+       if (!ASSERT_GE(lfd, 0, "lfd") ||
+           !ASSERT_OK(settcpca(lfd, "bpf_dctcp"), "lfd=>bpf_dctcp"))
+               goto done;
+
+       cli_fd = connect_to_fd_opts(lfd, &opts);
+       if (!ASSERT_GE(cli_fd, 0, "cli_fd"))
+               goto done;
+
+       srv_fd = accept(lfd, NULL, 0);
+       if (!ASSERT_GE(srv_fd, 0, "srv_fd"))
+               goto done;
+       ASSERT_STREQ(dctcp_skel->bss->cc_res, "cubic", "cc_res");
+       ASSERT_EQ(dctcp_skel->bss->tcp_cdg_res, -ENOTSUPP, "tcp_cdg_res");
+
+       err = getsockopt(srv_fd, SOL_TCP, TCP_CONGESTION, srv_cc, &cc_len);
+       if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)"))
+               goto done;
+       ASSERT_STREQ(srv_cc, "cubic", "srv_fd cc");
+
+done:
+       bpf_link__destroy(link);
+       bpf_dctcp__destroy(dctcp_skel);
+       if (lfd != -1)
+               close(lfd);
+       if (srv_fd != -1)
+               close(srv_fd);
+       if (cli_fd != -1)
+               close(cli_fd);
+}
+
+static void test_rel_setsockopt(void)
+{
+       struct bpf_dctcp_release *rel_skel;
+       libbpf_print_fn_t old_print_fn;
+
+       err_str = "unknown func bpf_setsockopt";
+       found = false;
+
+       old_print_fn = libbpf_set_print(libbpf_debug_print);
+       rel_skel = bpf_dctcp_release__open_and_load();
+       libbpf_set_print(old_print_fn);
+
+       ASSERT_ERR_PTR(rel_skel, "rel_skel");
+       ASSERT_TRUE(found, "expected_err_msg");
+
+       bpf_dctcp_release__destroy(rel_skel);
+}
+
 void test_bpf_tcp_ca(void)
 {
        if (test__start_subtest("dctcp"))
@@ -275,4 +331,8 @@ void test_bpf_tcp_ca(void)
                test_cubic();
        if (test__start_subtest("invalid_license"))
                test_invalid_license();
+       if (test__start_subtest("dctcp_fallback"))
+               test_dctcp_fallback();
+       if (test__start_subtest("rel_setsockopt"))
+               test_rel_setsockopt();
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_module.c b/tools/testing/selftests/bpf/prog_tests/btf_module.c
new file mode 100644 (file)
index 0000000..2239d1f
--- /dev/null
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2021 Hengqi Chen */
+
+#include <test_progs.h>
+#include <bpf/btf.h>
+
+static const char *module_name = "bpf_testmod";
+static const char *symbol_name = "bpf_testmod_test_read";
+
+void test_btf_module(void)
+{
+       struct btf *vmlinux_btf, *module_btf;
+       __s32 type_id;
+
+       if (!env.has_testmod) {
+               test__skip();
+               return;
+       }
+
+       vmlinux_btf = btf__load_vmlinux_btf();
+       if (!ASSERT_OK_PTR(vmlinux_btf, "could not load vmlinux BTF"))
+               return;
+
+       module_btf = btf__load_module_btf(module_name, vmlinux_btf);
+       if (!ASSERT_OK_PTR(module_btf, "could not load module BTF"))
+               goto cleanup;
+
+       type_id = btf__find_by_name(module_btf, symbol_name);
+       ASSERT_GT(type_id, 0, "func not found");
+
+cleanup:
+       btf__free(module_btf);
+       btf__free(vmlinux_btf);
+}
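
The returned id can be fed back into the usual BTF accessors; a short sketch using getters libbpf already exposes (that the symbol resolves to a BTF_KIND_FUNC is an assumption):

        const struct btf_type *t = btf__type_by_id(module_btf, type_id);

        if (btf_is_func(t))
                printf("%s is a function\n",
                       btf__name_by_offset(module_btf, t->name_off));
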
index 30a7b9b..9611f2b 100644 (file)
@@ -44,7 +44,7 @@ static void test_subprog(void)
        ASSERT_OK(err, "bpf_prog_test_run(test1)");
        ASSERT_EQ(retval, 10, "test1-retval");
        ASSERT_NEQ(skel->data->active_res, -1, "active_res");
-       ASSERT_EQ(skel->data->sk_state, BPF_TCP_CLOSE, "sk_state");
+       ASSERT_EQ(skel->data->sk_state_res, BPF_TCP_CLOSE, "sk_state_res");
 
        kfunc_call_test_subprog__destroy(skel);
 }
index 67bebd3..cf3acfa 100644 (file)
@@ -6,6 +6,7 @@
 #include <bpf/btf.h>
 #include "test_ksyms_btf.skel.h"
 #include "test_ksyms_btf_null_check.skel.h"
+#include "test_ksyms_weak.skel.h"
 
 static int duration;
 
@@ -81,6 +82,33 @@ static void test_null_check(void)
        test_ksyms_btf_null_check__destroy(skel);
 }
 
+static void test_weak_syms(void)
+{
+       struct test_ksyms_weak *skel;
+       struct test_ksyms_weak__data *data;
+       int err;
+
+       skel = test_ksyms_weak__open_and_load();
+       if (CHECK(!skel, "test_ksyms_weak__open_and_load", "failed\n"))
+               return;
+
+       err = test_ksyms_weak__attach(skel);
+       if (CHECK(err, "test_ksyms_weak__attach", "skeleton attach failed: %d\n", err))
+               goto cleanup;
+
+       /* trigger tracepoint */
+       usleep(1);
+
+       data = skel->data;
+       ASSERT_EQ(data->out__existing_typed, 0, "existing typed ksym");
+       ASSERT_NEQ(data->out__existing_typeless, -1, "existing typeless ksym");
+       ASSERT_EQ(data->out__non_existent_typeless, 0, "nonexistent typeless ksym");
+       ASSERT_EQ(data->out__non_existent_typed, 0, "nonexistent typed ksym");
+
+cleanup:
+       test_ksyms_weak__destroy(skel);
+}
+
 void test_ksyms_btf(void)
 {
        int percpu_datasec;
@@ -105,4 +133,7 @@ void test_ksyms_btf(void)
 
        if (test__start_subtest("null_check"))
                test_null_check();
+
+       if (test__start_subtest("weak_ksyms"))
+               test_weak_syms();
 }
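
The BPF side of the new subtest is not part of this hunk; a hedged sketch of what __weak ksym declarations look like, modeled on test_ksyms_weak.c (symbol names assumed):

__u64 out__existing_typed = -1;

extern const struct rq runqueues __ksym __weak;         /* typed */
extern const void bpf_prog_active __ksym __weak;        /* typeless */
extern const void bpf_link_fops2 __ksym __weak;         /* doesn't exist */

SEC("raw_tp/sys_enter")
int pass_handler(const void *ctx)
{
        struct rq *rq;

        /* a weak ksym may resolve to address 0, so NULL-check before use */
        rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, 0);
        if (rq)
                out__existing_typed = rq->cpu;
        return 0;
}
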
diff --git a/tools/testing/selftests/bpf/prog_tests/netns_cookie.c b/tools/testing/selftests/bpf/prog_tests/netns_cookie.c
new file mode 100644 (file)
index 0000000..71d8f3b
--- /dev/null
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include "netns_cookie_prog.skel.h"
+#include "network_helpers.h"
+
+#ifndef SO_NETNS_COOKIE
+#define SO_NETNS_COOKIE 71
+#endif
+
+static int duration;
+
+void test_netns_cookie(void)
+{
+       int server_fd = -1, client_fd = -1, cgroup_fd = -1;
+       int err, val, ret, map, verdict;
+       struct netns_cookie_prog *skel;
+       uint64_t cookie_expected_value;
+       socklen_t vallen = sizeof(cookie_expected_value);
+       static const char send_msg[] = "message";
+
+       skel = netns_cookie_prog__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "skel_open"))
+               return;
+
+       cgroup_fd = test__join_cgroup("/netns_cookie");
+       if (CHECK(cgroup_fd < 0, "join_cgroup", "cgroup creation failed\n"))
+               goto done;
+
+       skel->links.get_netns_cookie_sockops = bpf_program__attach_cgroup(
+               skel->progs.get_netns_cookie_sockops, cgroup_fd);
+       if (!ASSERT_OK_PTR(skel->links.get_netns_cookie_sockops, "prog_attach"))
+               goto done;
+
+       verdict = bpf_program__fd(skel->progs.get_netns_cookie_sk_msg);
+       map = bpf_map__fd(skel->maps.sock_map);
+       err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0);
+       if (!ASSERT_OK(err, "prog_attach"))
+               goto done;
+
+       server_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
+       if (CHECK(server_fd < 0, "start_server", "errno %d\n", errno))
+               goto done;
+
+       client_fd = connect_to_fd(server_fd, 0);
+       if (CHECK(client_fd < 0, "connect_to_fd", "errno %d\n", errno))
+               goto done;
+
+       ret = send(client_fd, send_msg, sizeof(send_msg), 0);
+       if (CHECK(ret != sizeof(send_msg), "send(msg)", "ret:%d\n", ret))
+               goto done;
+
+       err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.sockops_netns_cookies),
+                                 &client_fd, &val);
+       if (!ASSERT_OK(err, "map_lookup(sockops_netns_cookies)"))
+               goto done;
+
+       err = getsockopt(client_fd, SOL_SOCKET, SO_NETNS_COOKIE,
+                        &cookie_expected_value, &vallen);
+       if (!ASSERT_OK(err, "getsockopt"))
+               goto done;
+
+       ASSERT_EQ(val, cookie_expected_value, "cookie_value");
+
+       err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.sk_msg_netns_cookies),
+                                 &client_fd, &val);
+       if (!ASSERT_OK(err, "map_lookup(sk_msg_netns_cookies)"))
+               goto done;
+
+       ASSERT_EQ(val, cookie_expected_value, "cookie_value");
+
+done:
+       if (server_fd != -1)
+               close(server_fd);
+       if (client_fd != -1)
+               close(client_fd);
+       if (cgroup_fd != -1)
+               close(cgroup_fd);
+       netns_cookie_prog__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/perf_link.c b/tools/testing/selftests/bpf/prog_tests/perf_link.c
new file mode 100644 (file)
index 0000000..b1abd0c
--- /dev/null
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <sched.h>
+#include <test_progs.h>
+#include "test_perf_link.skel.h"
+
+static void burn_cpu(void)
+{
+       volatile int j = 0;
+       cpu_set_t cpu_set;
+       int i, err;
+
+       /* generate some branches on cpu 0 */
+       CPU_ZERO(&cpu_set);
+       CPU_SET(0, &cpu_set);
+       err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
+       ASSERT_OK(err, "set_thread_affinity");
+
+       /* spin the loop for a while (random high number) */
+       for (i = 0; i < 1000000; ++i)
+               ++j;
+}
+
+void test_perf_link(void)
+{
+       struct test_perf_link *skel = NULL;
+       struct perf_event_attr attr;
+       int pfd = -1, link_fd = -1, err;
+       int run_cnt_before, run_cnt_after;
+       struct bpf_link_info info;
+       __u32 info_len = sizeof(info);
+
+       /* create perf event */
+       memset(&attr, 0, sizeof(attr));
+       attr.size = sizeof(attr);
+       attr.type = PERF_TYPE_SOFTWARE;
+       attr.config = PERF_COUNT_SW_CPU_CLOCK;
+       attr.freq = 1;
+       attr.sample_freq = 4000;
+       pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
+       if (!ASSERT_GE(pfd, 0, "perf_fd"))
+               goto cleanup;
+
+       skel = test_perf_link__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "skel_load"))
+               goto cleanup;
+
+       link_fd = bpf_link_create(bpf_program__fd(skel->progs.handler), pfd,
+                                 BPF_PERF_EVENT, NULL);
+       if (!ASSERT_GE(link_fd, 0, "link_fd"))
+               goto cleanup;
+
+       memset(&info, 0, sizeof(info));
+       err = bpf_obj_get_info_by_fd(link_fd, &info, &info_len);
+       if (!ASSERT_OK(err, "link_get_info"))
+               goto cleanup;
+
+       ASSERT_EQ(info.type, BPF_LINK_TYPE_PERF_EVENT, "link_type");
+       ASSERT_GT(info.id, 0, "link_id");
+       ASSERT_GT(info.prog_id, 0, "link_prog_id");
+
+       /* ensure we get at least one perf_event prog execution */
+       burn_cpu();
+       ASSERT_GT(skel->bss->run_cnt, 0, "run_cnt");
+
+       /* perf_event is still active, but after we close the link the
+        * BPF program shouldn't be executed anymore
+        */
+       close(link_fd);
+       link_fd = -1;
+
+       /* make sure there are no stragglers */
+       kern_sync_rcu();
+
+       run_cnt_before = skel->bss->run_cnt;
+       burn_cpu();
+       run_cnt_after = skel->bss->run_cnt;
+
+       ASSERT_EQ(run_cnt_before, run_cnt_after, "run_cnt_before_after");
+
+cleanup:
+       if (link_fd >= 0)
+               close(link_fd);
+       if (pfd >= 0)
+               close(pfd);
+       test_perf_link__destroy(skel);
+}
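
The raw bpf_link_create() path can also carry a cookie; a sketch assuming the bpf_link_create_opts extension added alongside this test:

        DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts,
                .perf_event.bpf_cookie = 0x100000,
        );

        link_fd = bpf_link_create(bpf_program__fd(skel->progs.handler), pfd,
                                  BPF_PERF_EVENT, &opts);
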
index 023cc53..776916b 100644 (file)
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <test_progs.h>
+#include <sys/time.h>
+#include <sys/resource.h>
 #include "test_send_signal_kern.skel.h"
 
 int sigusr1_received = 0;
@@ -10,29 +12,25 @@ static void sigusr1_handler(int signum)
 }
 
 static void test_send_signal_common(struct perf_event_attr *attr,
-                                   bool signal_thread,
-                                   const char *test_name)
+                                   bool signal_thread)
 {
        struct test_send_signal_kern *skel;
        int pipe_c2p[2], pipe_p2c[2];
        int err = -1, pmu_fd = -1;
-       __u32 duration = 0;
        char buf[256];
        pid_t pid;
 
-       if (CHECK(pipe(pipe_c2p), test_name,
-                 "pipe pipe_c2p error: %s\n", strerror(errno)))
+       if (!ASSERT_OK(pipe(pipe_c2p), "pipe_c2p"))
                return;
 
-       if (CHECK(pipe(pipe_p2c), test_name,
-                 "pipe pipe_p2c error: %s\n", strerror(errno))) {
+       if (!ASSERT_OK(pipe(pipe_p2c), "pipe_p2c")) {
                close(pipe_c2p[0]);
                close(pipe_c2p[1]);
                return;
        }
 
        pid = fork();
-       if (CHECK(pid < 0, test_name, "fork error: %s\n", strerror(errno))) {
+       if (!ASSERT_GE(pid, 0, "fork")) {
                close(pipe_c2p[0]);
                close(pipe_c2p[1]);
                close(pipe_p2c[0]);
@@ -41,26 +39,40 @@ static void test_send_signal_common(struct perf_event_attr *attr,
        }
 
        if (pid == 0) {
+               int old_prio;
+
                /* install signal handler and notify parent */
                signal(SIGUSR1, sigusr1_handler);
 
                close(pipe_c2p[0]); /* close read */
                close(pipe_p2c[1]); /* close write */
 
+               /* boost to a high priority so we have a better chance
+                * that, if an interrupt happens, the underlying task
+                * is this process.
+                */
+               errno = 0;
+               old_prio = getpriority(PRIO_PROCESS, 0);
+               ASSERT_OK(errno, "getpriority");
+               ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority");
+
                /* notify parent signal handler is installed */
-               CHECK(write(pipe_c2p[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno);
+               ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write");
 
                /* make sure parent enabled bpf program to send_signal */
-               CHECK(read(pipe_p2c[0], buf, 1) != 1, "pipe_read", "err %d\n", -errno);
+               ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read");
 
                /* wait a little for signal handler */
                sleep(1);
 
                buf[0] = sigusr1_received ? '2' : '0';
-               CHECK(write(pipe_c2p[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno);
+               ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write");
 
                /* wait for parent notification and exit */
-               CHECK(read(pipe_p2c[0], buf, 1) != 1, "pipe_read", "err %d\n", -errno);
+               ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read");
+
+               /* restore the old priority */
+               ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority");
 
                close(pipe_c2p[1]);
                close(pipe_p2c[0]);
@@ -71,20 +83,19 @@ static void test_send_signal_common(struct perf_event_attr *attr,
        close(pipe_p2c[0]); /* close read */
 
        skel = test_send_signal_kern__open_and_load();
-       if (CHECK(!skel, "skel_open_and_load", "skeleton open_and_load failed\n"))
+       if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
                goto skel_open_load_failure;
 
        if (!attr) {
                err = test_send_signal_kern__attach(skel);
-               if (CHECK(err, "skel_attach", "skeleton attach failed\n")) {
+               if (!ASSERT_OK(err, "skel_attach")) {
                        err = -1;
                        goto destroy_skel;
                }
        } else {
                pmu_fd = syscall(__NR_perf_event_open, attr, pid, -1,
                                 -1 /* group id */, 0 /* flags */);
-               if (CHECK(pmu_fd < 0, test_name, "perf_event_open error: %s\n",
-                       strerror(errno))) {
+               if (!ASSERT_GE(pmu_fd, 0, "perf_event_open")) {
                        err = -1;
                        goto destroy_skel;
                }
@@ -96,7 +107,7 @@ static void test_send_signal_common(struct perf_event_attr *attr,
        }
 
        /* wait until child signal handler installed */
-       CHECK(read(pipe_c2p[0], buf, 1) != 1, "pipe_read", "err %d\n", -errno);
+       ASSERT_EQ(read(pipe_c2p[0], buf, 1), 1, "pipe_read");
 
        /* trigger the bpf send_signal */
        skel->bss->pid = pid;
@@ -104,21 +115,21 @@ static void test_send_signal_common(struct perf_event_attr *attr,
        skel->bss->signal_thread = signal_thread;
 
        /* notify child that bpf program can send_signal now */
-       CHECK(write(pipe_p2c[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno);
+       ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
 
        /* wait for result */
        err = read(pipe_c2p[0], buf, 1);
-       if (CHECK(err < 0, test_name, "reading pipe error: %s\n", strerror(errno)))
+       if (!ASSERT_GE(err, 0, "reading pipe"))
                goto disable_pmu;
-       if (CHECK(err == 0, test_name, "reading pipe error: size 0\n")) {
+       if (!ASSERT_GT(err, 0, "reading pipe error: size 0")) {
                err = -1;
                goto disable_pmu;
        }
 
-       CHECK(buf[0] != '2', test_name, "incorrect result\n");
+       ASSERT_EQ(buf[0], '2', "incorrect result");
 
        /* notify child safe to exit */
-       CHECK(write(pipe_p2c[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno);
+       ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
 
 disable_pmu:
        close(pmu_fd);
@@ -132,7 +143,7 @@ skel_open_load_failure:
 
 static void test_send_signal_tracepoint(bool signal_thread)
 {
-       test_send_signal_common(NULL, signal_thread, "tracepoint");
+       test_send_signal_common(NULL, signal_thread);
 }
 
 static void test_send_signal_perf(bool signal_thread)
@@ -143,7 +154,7 @@ static void test_send_signal_perf(bool signal_thread)
                .config = PERF_COUNT_SW_CPU_CLOCK,
        };
 
-       test_send_signal_common(&attr, signal_thread, "perf_sw_event");
+       test_send_signal_common(&attr, signal_thread);
 }
 
 static void test_send_signal_nmi(bool signal_thread)
@@ -172,7 +183,7 @@ static void test_send_signal_nmi(bool signal_thread)
                close(pmu_fd);
        }
 
-       test_send_signal_common(&attr, signal_thread, "perf_hw_event");
+       test_send_signal_common(&attr, signal_thread);
 }
 
 void test_send_signal(void)
index dffbcaa..8fd1b4b 100644 (file)
@@ -19,7 +19,7 @@
 #define EXP_ADDR_OUT "0000000000000000 ffff00000add4e55 "
 #define EXP_ADDR_RET sizeof(EXP_ADDR_OUT "unknownhashedptr")
 
-#define EXP_STR_OUT  "str1 longstr"
+#define EXP_STR_OUT  "str1         a  b c      d e longstr"
 #define EXP_STR_RET  sizeof(EXP_STR_OUT)
 
 #define EXP_OVER_OUT "%over"
@@ -114,6 +114,8 @@ void test_snprintf_negative(void)
        ASSERT_ERR(load_single_snprintf("%"), "invalid specifier 3");
        ASSERT_ERR(load_single_snprintf("%12345678"), "invalid specifier 4");
        ASSERT_ERR(load_single_snprintf("%--------"), "invalid specifier 5");
+       ASSERT_ERR(load_single_snprintf("%lc"), "invalid specifier 6");
+       ASSERT_ERR(load_single_snprintf("%llc"), "invalid specifier 7");
        ASSERT_ERR(load_single_snprintf("\x80"), "non ascii character");
        ASSERT_ERR(load_single_snprintf("\x1"), "non printable character");
 }
index a9f1bf9..5c59790 100644 (file)
@@ -949,6 +949,7 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd,
        int err, n;
        u32 key;
        char b;
+       int retries = 100;
 
        zero_verdict_count(verd_mapfd);
 
@@ -1001,10 +1002,15 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd,
                goto close_peer1;
        if (pass != 1)
                FAIL("%s: want pass count 1, have %d", log_prefix, pass);
-
+again:
        n = read(c0, &b, 1);
-       if (n < 0)
+       if (n < 0) {
+               if (errno == EAGAIN && retries--) {
+                       usleep(1000);
+                       goto again;
+               }
                FAIL_ERRNO("%s: read", log_prefix);
+       }
        if (n == 0)
                FAIL("%s: incomplete read", log_prefix);
 
@@ -1603,8 +1609,10 @@ static void unix_redir_to_connected(int sotype, int sock_mapfd,
 again:
        n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
        if (n < 0) {
-               if (errno == EAGAIN && retries--)
+               if (errno == EAGAIN && retries--) {
+                       usleep(1000);
                        goto again;
+               }
                FAIL_ERRNO("%s: read", log_prefix);
        }
        if (n == 0)
@@ -1692,14 +1700,14 @@ static void test_reuseport(struct test_sockmap_listen *skel,
        }
 }
 
-static int udp_socketpair(int family, int *s, int *c)
+static int inet_socketpair(int family, int type, int *s, int *c)
 {
        struct sockaddr_storage addr;
        socklen_t len;
        int p0, c0;
        int err;
 
-       p0 = socket_loopback(family, SOCK_DGRAM | SOCK_NONBLOCK);
+       p0 = socket_loopback(family, type | SOCK_NONBLOCK);
        if (p0 < 0)
                return p0;
 
@@ -1708,7 +1716,7 @@ static int udp_socketpair(int family, int *s, int *c)
        if (err)
                goto close_peer0;
 
-       c0 = xsocket(family, SOCK_DGRAM | SOCK_NONBLOCK, 0);
+       c0 = xsocket(family, type | SOCK_NONBLOCK, 0);
        if (c0 < 0) {
                err = c0;
                goto close_peer0;
@@ -1747,10 +1755,10 @@ static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd,
 
        zero_verdict_count(verd_mapfd);
 
-       err = udp_socketpair(family, &p0, &c0);
+       err = inet_socketpair(family, SOCK_DGRAM, &p0, &c0);
        if (err)
                return;
-       err = udp_socketpair(family, &p1, &c1);
+       err = inet_socketpair(family, SOCK_DGRAM, &p1, &c1);
        if (err)
                goto close_cli0;
 
@@ -1776,8 +1784,10 @@ static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd,
 again:
        n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
        if (n < 0) {
-               if (errno == EAGAIN && retries--)
+               if (errno == EAGAIN && retries--) {
+                       usleep(1000);
                        goto again;
+               }
                FAIL_ERRNO("%s: read", log_prefix);
        }
        if (n == 0)
@@ -1825,7 +1835,7 @@ static void test_udp_redir(struct test_sockmap_listen *skel, struct bpf_map *map
        udp_skb_redir_to_connected(skel, map, family);
 }
 
-static void udp_unix_redir_to_connected(int family, int sock_mapfd,
+static void inet_unix_redir_to_connected(int family, int type, int sock_mapfd,
                                        int verd_mapfd, enum redir_mode mode)
 {
        const char *log_prefix = redir_mode_str(mode);
@@ -1843,7 +1853,7 @@ static void udp_unix_redir_to_connected(int family, int sock_mapfd,
                return;
        c0 = sfd[0], p0 = sfd[1];
 
-       err = udp_socketpair(family, &p1, &c1);
+       err = inet_socketpair(family, SOCK_DGRAM, &p1, &c1);
        if (err)
                goto close;
 
@@ -1869,8 +1879,10 @@ static void udp_unix_redir_to_connected(int family, int sock_mapfd,
 again:
        n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
        if (n < 0) {
-               if (errno == EAGAIN && retries--)
+               if (errno == EAGAIN && retries--) {
+                       usleep(1000);
                        goto again;
+               }
                FAIL_ERRNO("%s: read", log_prefix);
        }
        if (n == 0)
@@ -1884,7 +1896,7 @@ close:
        xclose(p0);
 }
 
-static void udp_unix_skb_redir_to_connected(struct test_sockmap_listen *skel,
+static void inet_unix_skb_redir_to_connected(struct test_sockmap_listen *skel,
                                            struct bpf_map *inner_map, int family)
 {
        int verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
@@ -1897,14 +1909,20 @@ static void udp_unix_skb_redir_to_connected(struct test_sockmap_listen *skel,
                return;
 
        skel->bss->test_ingress = false;
-       udp_unix_redir_to_connected(family, sock_map, verdict_map, REDIR_EGRESS);
+       inet_unix_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map,
+                                   REDIR_EGRESS);
+       inet_unix_redir_to_connected(family, SOCK_STREAM, sock_map, verdict_map,
+                                   REDIR_EGRESS);
        skel->bss->test_ingress = true;
-       udp_unix_redir_to_connected(family, sock_map, verdict_map, REDIR_INGRESS);
+       inet_unix_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map,
+                                   REDIR_INGRESS);
+       inet_unix_redir_to_connected(family, SOCK_STREAM, sock_map, verdict_map,
+                                   REDIR_INGRESS);
 
        xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT);
 }
 
-static void unix_udp_redir_to_connected(int family, int sock_mapfd,
+static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd,
                                        int verd_mapfd, enum redir_mode mode)
 {
        const char *log_prefix = redir_mode_str(mode);
@@ -1914,10 +1932,11 @@ static void unix_udp_redir_to_connected(int family, int sock_mapfd,
        int sfd[2];
        u32 key;
        char b;
+       int retries = 100;
 
        zero_verdict_count(verd_mapfd);
 
-       err = udp_socketpair(family, &p0, &c0);
+       err = inet_socketpair(family, SOCK_DGRAM, &p0, &c0);
        if (err)
                return;
 
@@ -1944,9 +1963,15 @@ static void unix_udp_redir_to_connected(int family, int sock_mapfd,
        if (pass != 1)
                FAIL("%s: want pass count 1, have %d", log_prefix, pass);
 
+again:
        n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
-       if (n < 0)
+       if (n < 0) {
+               if (errno == EAGAIN && retries--) {
+                       usleep(1000);
+                       goto again;
+               }
                FAIL_ERRNO("%s: read", log_prefix);
+       }
        if (n == 0)
                FAIL("%s: incomplete read", log_prefix);
 
@@ -1959,7 +1984,7 @@ close_cli0:
 
 }
 
-static void unix_udp_skb_redir_to_connected(struct test_sockmap_listen *skel,
+static void unix_inet_skb_redir_to_connected(struct test_sockmap_listen *skel,
                                            struct bpf_map *inner_map, int family)
 {
        int verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
@@ -1972,9 +1997,15 @@ static void unix_udp_skb_redir_to_connected(struct test_sockmap_listen *skel,
                return;
 
        skel->bss->test_ingress = false;
-       unix_udp_redir_to_connected(family, sock_map, verdict_map, REDIR_EGRESS);
+       unix_inet_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map,
+                                    REDIR_EGRESS);
+       unix_inet_redir_to_connected(family, SOCK_STREAM, sock_map, verdict_map,
+                                    REDIR_EGRESS);
        skel->bss->test_ingress = true;
-       unix_udp_redir_to_connected(family, sock_map, verdict_map, REDIR_INGRESS);
+       unix_inet_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map,
+                                    REDIR_INGRESS);
+       unix_inet_redir_to_connected(family, SOCK_STREAM, sock_map, verdict_map,
+                                    REDIR_INGRESS);
 
        xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT);
 }
@@ -1990,8 +2021,8 @@ static void test_udp_unix_redir(struct test_sockmap_listen *skel, struct bpf_map
        snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__);
        if (!test__start_subtest(s))
                return;
-       udp_unix_skb_redir_to_connected(skel, map, family);
-       unix_udp_skb_redir_to_connected(skel, map, family);
+       inet_unix_skb_redir_to_connected(skel, map, family);
+       unix_inet_skb_redir_to_connected(skel, map, family);
 }
 
 static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map,
@@ -2020,11 +2051,13 @@ void test_sockmap_listen(void)
        run_tests(skel, skel->maps.sock_map, AF_INET);
        run_tests(skel, skel->maps.sock_map, AF_INET6);
        test_unix_redir(skel, skel->maps.sock_map, SOCK_DGRAM);
+       test_unix_redir(skel, skel->maps.sock_map, SOCK_STREAM);
 
        skel->bss->test_sockmap = false;
        run_tests(skel, skel->maps.sock_hash, AF_INET);
        run_tests(skel, skel->maps.sock_hash, AF_INET6);
        test_unix_redir(skel, skel->maps.sock_hash, SOCK_DGRAM);
+       test_unix_redir(skel, skel->maps.sock_hash, SOCK_STREAM);
 
        test_sockmap_listen__destroy(skel);
 }
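
The same EAGAIN retry loop is now open-coded at four call sites above; a hypothetical helper (not part of the patch) showing the shared pattern:

static ssize_t read_retry(int fd, void *buf, size_t len, int retries)
{
        ssize_t n;

        do {
                n = read(fd, buf, len);
                if (n >= 0 || errno != EAGAIN)
                        break;
                usleep(1000);   /* give the redirected skb time to arrive */
        } while (retries--);
        return n;
}
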
index ec281b0..86f9768 100644 (file)
@@ -195,8 +195,10 @@ static void run_test(int cgroup_fd)
 
        pthread_mutex_lock(&server_started_mtx);
        if (CHECK_FAIL(pthread_create(&tid, NULL, server_thread,
-                                     (void *)&server_fd)))
+                                     (void *)&server_fd))) {
+               pthread_mutex_unlock(&server_started_mtx);
                goto close_server_fd;
+       }
        pthread_cond_wait(&server_started, &server_started_mtx);
        pthread_mutex_unlock(&server_started_mtx);
 
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_qos_to_cc.c b/tools/testing/selftests/bpf/prog_tests/sockopt_qos_to_cc.c
new file mode 100644 (file)
index 0000000..6b53b3c
--- /dev/null
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <test_progs.h>
+#include <netinet/tcp.h>
+#include "sockopt_qos_to_cc.skel.h"
+
+static void run_setsockopt_test(int cg_fd, int sock_fd)
+{
+       socklen_t optlen;
+       char cc[16]; /* TCP_CA_NAME_MAX */
+       int buf;
+       int err = -1;
+
+       buf = 0x2D;
+       err = setsockopt(sock_fd, SOL_IPV6, IPV6_TCLASS, &buf, sizeof(buf));
+       if (!ASSERT_OK(err, "setsockopt(sock_fd, IPV6_TCLASS)"))
+               return;
+
+       /* Verify the setsockopt cc change */
+       optlen = sizeof(cc);
+       err = getsockopt(sock_fd, SOL_TCP, TCP_CONGESTION, cc, &optlen);
+       if (!ASSERT_OK(err, "getsockopt(sock_fd, TCP_CONGESTION)"))
+               return;
+
+       if (!ASSERT_STREQ(cc, "reno", "getsockopt(sock_fd, TCP_CONGESTION)"))
+               return;
+}
+
+void test_sockopt_qos_to_cc(void)
+{
+       struct sockopt_qos_to_cc *skel;
+       char cc_cubic[16] = "cubic"; /* TCP_CA_NAME_MAX */
+       int cg_fd = -1;
+       int sock_fd = -1;
+       int err;
+
+       cg_fd = test__join_cgroup("/sockopt_qos_to_cc");
+       if (!ASSERT_GE(cg_fd, 0, "cg-join(sockopt_qos_to_cc)"))
+               return;
+
+       skel = sockopt_qos_to_cc__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "skel"))
+               goto done;
+
+       sock_fd = socket(AF_INET6, SOCK_STREAM, 0);
+       if (!ASSERT_GE(sock_fd, 0, "v6 socket open"))
+               goto done;
+
+       err = setsockopt(sock_fd, SOL_TCP, TCP_CONGESTION, &cc_cubic,
+                        sizeof(cc_cubic));
+       if (!ASSERT_OK(err, "setsockopt(sock_fd, TCP_CONGESTION)"))
+               goto done;
+
+       skel->links.sockopt_qos_to_cc =
+               bpf_program__attach_cgroup(skel->progs.sockopt_qos_to_cc,
+                                          cg_fd);
+       if (!ASSERT_OK_PTR(skel->links.sockopt_qos_to_cc,
+                          "prog_attach(sockopt_qos_to_cc)"))
+               goto done;
+
+       run_setsockopt_test(cg_fd, sock_fd);
+
+done:
+       if (sock_fd != -1)
+               close(sock_fd);
+       if (cg_fd != -1)
+               close(cg_fd);
+       /* destroy can handle NULL and error pointers */
+       sockopt_qos_to_cc__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/task_pt_regs.c b/tools/testing/selftests/bpf/prog_tests/task_pt_regs.c
new file mode 100644 (file)
index 0000000..53f0e0f
--- /dev/null
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <test_progs.h>
+#include <linux/ptrace.h>
+#include "test_task_pt_regs.skel.h"
+
+void test_task_pt_regs(void)
+{
+       struct test_task_pt_regs *skel;
+       struct bpf_link *uprobe_link;
+       size_t uprobe_offset;
+       ssize_t base_addr;
+       bool match;
+
+       base_addr = get_base_addr();
+       if (!ASSERT_GT(base_addr, 0, "get_base_addr"))
+               return;
+       uprobe_offset = get_uprobe_offset(&get_base_addr, base_addr);
+
+       skel = test_task_pt_regs__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "skel_open"))
+               return;
+       if (!ASSERT_OK_PTR(skel->bss, "check_bss"))
+               goto cleanup;
+
+       uprobe_link = bpf_program__attach_uprobe(skel->progs.handle_uprobe,
+                                                false /* retprobe */,
+                                                0 /* self pid */,
+                                                "/proc/self/exe",
+                                                uprobe_offset);
+       if (!ASSERT_OK_PTR(uprobe_link, "attach_uprobe"))
+               goto cleanup;
+       skel->links.handle_uprobe = uprobe_link;
+
+       /* trigger & validate uprobe */
+       get_base_addr();
+
+       if (!ASSERT_EQ(skel->bss->uprobe_res, 1, "check_uprobe_res"))
+               goto cleanup;
+
+       match = !memcmp(&skel->bss->current_regs, &skel->bss->ctx_regs,
+                       sizeof(skel->bss->current_regs));
+       ASSERT_TRUE(match, "check_regs_match");
+
+cleanup:
+       test_task_pt_regs__destroy(skel);
+}
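
The BPF program under test is not shown in this hunk; a sketch of the new bpf_task_pt_regs() helper's intended use, modeled on test_task_pt_regs.c (global names assumed):

struct pt_regs current_regs, ctx_regs;
int uprobe_res;

SEC("uprobe/trigger_func")
int handle_uprobe(struct pt_regs *ctx)
{
        struct task_struct *current;
        struct pt_regs *regs;

        current = bpf_get_current_task_btf();
        regs = (struct pt_regs *)bpf_task_pt_regs(current);

        /* userspace compares the task's saved regs with the uprobe ctx */
        __builtin_memcpy(&current_regs, regs, sizeof(*regs));
        __builtin_memcpy(&ctx_regs, ctx, sizeof(*ctx));
        uprobe_res = 1;

        return 0;
}
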
index f5acbcb..ced8f6c 100644 (file)
@@ -23,8 +23,12 @@ static int timer_mim(struct timer_mim *timer_skel)
 
        /* check that timer_cb[12] are incrementing 'cnt' */
        cnt1 = READ_ONCE(timer_skel->bss->cnt);
-       usleep(200); /* 100 times more than interval */
-       cnt2 = READ_ONCE(timer_skel->bss->cnt);
+       for (int i = 0; i < 100; i++) {
+               cnt2 = READ_ONCE(timer_skel->bss->cnt);
+               if (cnt2 != cnt1)
+                       break;
+               usleep(200); /* 100 times more than interval */
+       }
        ASSERT_GT(cnt2, cnt1, "cnt");
 
        ASSERT_EQ(timer_skel->bss->err, 0, "err");
@@ -37,8 +41,12 @@ static int timer_mim(struct timer_mim *timer_skel)
 
        /* check that timer_cb[12] are no longer running */
        cnt1 = READ_ONCE(timer_skel->bss->cnt);
-       usleep(200);
-       cnt2 = READ_ONCE(timer_skel->bss->cnt);
+       for (int i = 0; i < 100; i++) {
+               usleep(200); /* 100 times more than interval */
+               cnt2 = READ_ONCE(timer_skel->bss->cnt);
+               if (cnt2 == cnt1)
+                       break;
+       }
        ASSERT_EQ(cnt2, cnt1, "cnt");
 
        return 0;
index 6b186b4..370d220 100644 (file)
@@ -493,20 +493,20 @@ void test_xdp_bonding(void)
                           "xdp_redirect_multi_kern__open_and_load"))
                goto out;
 
-       if (!test__start_subtest("xdp_bonding_attach"))
+       if (test__start_subtest("xdp_bonding_attach"))
                test_xdp_bonding_attach(&skeletons);
 
        for (i = 0; i < ARRAY_SIZE(bond_test_cases); i++) {
                struct bond_test_case *test_case = &bond_test_cases[i];
 
-               if (!test__start_subtest(test_case->name))
+               if (test__start_subtest(test_case->name))
                        test_xdp_bonding_with_mode(
                                &skeletons,
                                test_case->mode,
                                test_case->xmit_policy);
        }
 
-       if (!test__start_subtest("xdp_bonding_redirect_multi"))
+       if (test__start_subtest("xdp_bonding_redirect_multi"))
                test_xdp_bonding_redirect_multi(&skeletons);
 
 out:
index fd42247..9573be6 100644 (file)
 
 char _license[] SEC("license") = "GPL";
 
+volatile const char fallback[TCP_CA_NAME_MAX];
+const char bpf_dctcp[] = "bpf_dctcp";
+const char tcp_cdg[] = "cdg";
+char cc_res[TCP_CA_NAME_MAX];
+int tcp_cdg_res = 0;
 int stg_result = 0;
 
 struct {
@@ -57,6 +62,26 @@ void BPF_PROG(dctcp_init, struct sock *sk)
        struct dctcp *ca = inet_csk_ca(sk);
        int *stg;
 
+       if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) {
+               /* Switch to fallback */
+               bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+                              (void *)fallback, sizeof(fallback));
+               /* Switch back to myself; the bpf trampoline prevents
+                * dctcp_init from being called recursively here.
+                */
+               bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+                              (void *)bpf_dctcp, sizeof(bpf_dctcp));
+               /* Switch back to fallback */
+               bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+                              (void *)fallback, sizeof(fallback));
+               /* Expecting -ENOTSUPP for tcp_cdg_res */
+               tcp_cdg_res = bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+                                            (void *)tcp_cdg, sizeof(tcp_cdg));
+               bpf_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
+                              (void *)cc_res, sizeof(cc_res));
+               return;
+       }
+
        ca->prior_rcv_nxt = tp->rcv_nxt;
        ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
        ca->loss_cwnd = 0;
diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c
new file mode 100644 (file)
index 0000000..d836f7c
--- /dev/null
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/tcp.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_tcp_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+const char cubic[] = "cubic";
+
+void BPF_STRUCT_OPS(dctcp_nouse_release, struct sock *sk)
+{
+       bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+                      (void *)cubic, sizeof(cubic));
+}
+
+SEC(".struct_ops")
+struct tcp_congestion_ops dctcp_rel = {
+       .release        = (void *)dctcp_nouse_release,
+       .name           = "bpf_dctcp_rel",
+};
index 3d83b18..8cfaeba 100644 (file)
@@ -12,6 +12,7 @@
 #define tcp6_sock tcp6_sock___not_used
 #define bpf_iter__udp bpf_iter__udp___not_used
 #define udp6_sock udp6_sock___not_used
+#define bpf_iter__unix bpf_iter__unix___not_used
 #define bpf_iter__bpf_map_elem bpf_iter__bpf_map_elem___not_used
 #define bpf_iter__bpf_sk_storage_map bpf_iter__bpf_sk_storage_map___not_used
 #define bpf_iter__sockmap bpf_iter__sockmap___not_used
@@ -32,6 +33,7 @@
 #undef tcp6_sock
 #undef bpf_iter__udp
 #undef udp6_sock
+#undef bpf_iter__unix
 #undef bpf_iter__bpf_map_elem
 #undef bpf_iter__bpf_sk_storage_map
 #undef bpf_iter__sockmap
@@ -103,6 +105,12 @@ struct udp6_sock {
        struct ipv6_pinfo inet6;
 } __attribute__((preserve_access_index));
 
+struct bpf_iter__unix {
+       struct bpf_iter_meta *meta;
+       struct unix_sock *unix_sk;
+       uid_t uid;
+} __attribute__((preserve_access_index));
+
 struct bpf_iter__bpf_map_elem {
        struct bpf_iter_meta *meta;
        struct bpf_map *map;
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_unix.c b/tools/testing/selftests/bpf/progs/bpf_iter_unix.c
new file mode 100644 (file)
index 0000000..9442390
--- /dev/null
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright Amazon.com Inc. or its affiliates. */
+#include "bpf_iter.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+char _license[] SEC("license") = "GPL";
+
+static long sock_i_ino(const struct sock *sk)
+{
+       const struct socket *sk_socket = sk->sk_socket;
+       const struct inode *inode;
+       unsigned long ino;
+
+       if (!sk_socket)
+               return 0;
+
+       inode = &container_of(sk_socket, struct socket_alloc, socket)->vfs_inode;
+       bpf_probe_read_kernel(&ino, sizeof(ino), &inode->i_ino);
+       return ino;
+}
+
+SEC("iter/unix")
+int dump_unix(struct bpf_iter__unix *ctx)
+{
+       struct unix_sock *unix_sk = ctx->unix_sk;
+       struct sock *sk = (struct sock *)unix_sk;
+       struct seq_file *seq;
+       __u32 seq_num;
+
+       if (!unix_sk)
+               return 0;
+
+       seq = ctx->meta->seq;
+       seq_num = ctx->meta->seq_num;
+       if (seq_num == 0)
+               BPF_SEQ_PRINTF(seq, "Num               RefCount Protocol Flags    Type St    Inode Path\n");
+
+       BPF_SEQ_PRINTF(seq, "%pK: %08X %08X %08X %04X %02X %8lu",
+                      unix_sk,
+                      sk->sk_refcnt.refs.counter,
+                      0,
+                      sk->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
+                      sk->sk_type,
+                      sk->sk_socket ?
+                      (sk->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
+                      (sk->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
+                      sock_i_ino(sk));
+
+       if (unix_sk->addr) {
+               if (!UNIX_ABSTRACT(unix_sk)) {
+                       BPF_SEQ_PRINTF(seq, " %s", unix_sk->addr->name->sun_path);
+               } else {
+                       /* The name of the abstract UNIX domain socket starts
+                        * with '\0' and can contain '\0'.  The null bytes
+                        * should be escaped as done in unix_seq_show().
+                        */
+                       __u64 i, len;
+
+                       len = unix_sk->addr->len - sizeof(short);
+
+                       BPF_SEQ_PRINTF(seq, " @");
+
+                       for (i = 1; i < len; i++) {
+                               /* unix_mkname() tests this upper bound. */
+                               if (i >= sizeof(struct sockaddr_un))
+                                       break;
+
+                               BPF_SEQ_PRINTF(seq, "%c",
+                                              unix_sk->addr->name->sun_path[i] ?:
+                                              '@');
+                       }
+               }
+       }
+
+       BPF_SEQ_PRINTF(seq, "\n");
+
+       return 0;
+}
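
A sketch of consuming the iterator from userspace (do_dummy_read() in the prog_tests side does essentially this):

        struct bpf_link *link;
        char buf[4096];
        int iter_fd;
        ssize_t n;

        link = bpf_program__attach_iter(skel->progs.dump_unix, NULL);
        if (!link)
                return;
        iter_fd = bpf_iter_create(bpf_link__fd(link));
        if (iter_fd >= 0) {
                while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
                        ;       /* each chunk holds lines from BPF_SEQ_PRINTF */
                close(iter_fd);
        }
        bpf_link__destroy(link);
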
index 3af0998..eef5646 100644 (file)
@@ -5,6 +5,10 @@
 #define AF_INET                        2
 #define AF_INET6               10
 
+#define __SO_ACCEPTCON         (1 << 16)
+#define UNIX_HASH_SIZE         256
+#define UNIX_ABSTRACT(unix_sk) (unix_sk->addr->hash < UNIX_HASH_SIZE)
+
 #define SOL_TCP                        6
 #define TCP_CONGESTION         13
 #define TCP_CA_NAME_MAX                16
index b2dcb7d..5fbd9e2 100644 (file)
@@ -9,7 +9,7 @@ extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b,
                                  __u32 c, __u64 d) __ksym;
 extern struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym;
 int active_res = -1;
-int sk_state = -1;
+int sk_state_res = -1;
 
 int __noinline f1(struct __sk_buff *skb)
 {
@@ -28,7 +28,7 @@ int __noinline f1(struct __sk_buff *skb)
        if (active)
                active_res = *active;
 
-       sk_state = bpf_kfunc_call_test3((struct sock *)sk)->__sk_common.skc_state;
+       sk_state_res = bpf_kfunc_call_test3((struct sock *)sk)->sk_state;
 
        return (__u32)bpf_kfunc_call_test1((struct sock *)sk, 1, 2, 3, 4);
 }
diff --git a/tools/testing/selftests/bpf/progs/netns_cookie_prog.c b/tools/testing/selftests/bpf/progs/netns_cookie_prog.c
new file mode 100644 (file)
index 0000000..aeff3a4
--- /dev/null
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+
+#define AF_INET6 10
+
+struct {
+       __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+       __uint(map_flags, BPF_F_NO_PREALLOC);
+       __type(key, int);
+       __type(value, int);
+} sockops_netns_cookies SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+       __uint(map_flags, BPF_F_NO_PREALLOC);
+       __type(key, int);
+       __type(value, int);
+} sk_msg_netns_cookies SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_SOCKMAP);
+       __uint(max_entries, 2);
+       __type(key, __u32);
+       __type(value, __u64);
+} sock_map SEC(".maps");
+
+SEC("sockops")
+int get_netns_cookie_sockops(struct bpf_sock_ops *ctx)
+{
+       struct bpf_sock *sk = ctx->sk;
+       int *cookie;
+       __u32 key = 0;
+
+       if (ctx->family != AF_INET6)
+               return 1;
+
+       if (!sk)
+               return 1;
+
+       switch (ctx->op) {
+       case BPF_SOCK_OPS_TCP_CONNECT_CB:
+               cookie = bpf_sk_storage_get(&sockops_netns_cookies, sk, 0,
+                                           BPF_SK_STORAGE_GET_F_CREATE);
+               if (!cookie)
+                       return 1;
+
+               *cookie = bpf_get_netns_cookie(ctx);
+               break;
+       case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+               bpf_sock_map_update(ctx, &sock_map, &key, BPF_NOEXIST);
+               break;
+       default:
+               break;
+       }
+
+       return 1;
+}
+
+SEC("sk_msg")
+int get_netns_cookie_sk_msg(struct sk_msg_md *msg)
+{
+       struct bpf_sock *sk = msg->sk;
+       int *cookie;
+
+       if (msg->family != AF_INET6)
+               return 1;
+
+       if (!sk)
+               return 1;
+
+       cookie = bpf_sk_storage_get(&sk_msg_netns_cookies, sk, 0,
+                                   BPF_SK_STORAGE_GET_F_CREATE);
+       if (!cookie)
+               return 1;
+
+       *cookie = bpf_get_netns_cookie(msg);
+
+       return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c
new file mode 100644 (file)
index 0000000..1bce83b
--- /dev/null
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <string.h>
+#include <linux/tcp.h>
+#include <netinet/in.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_tcp_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+
+SEC("cgroup/setsockopt")
+int sockopt_qos_to_cc(struct bpf_sockopt *ctx)
+{
+       void *optval_end = ctx->optval_end;
+       int *optval = ctx->optval;
+       char buf[TCP_CA_NAME_MAX];
+       char cc_reno[TCP_CA_NAME_MAX] = "reno";
+       char cc_cubic[TCP_CA_NAME_MAX] = "cubic";
+
+       if (ctx->level != SOL_IPV6 || ctx->optname != IPV6_TCLASS)
+               return 1;
+
+       if (optval + 1 > optval_end)
+               return 0; /* EPERM, bounds check */
+
+       if (bpf_getsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf)))
+               return 0;
+
+       if (!tcp_cc_eq(buf, cc_cubic))
+               return 0;
+
+       if (*optval == 0x2d) {
+               if (bpf_setsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, &cc_reno,
+                               sizeof(cc_reno)))
+                       return 0;
+       }
+       return 1;
+}
index 8acdb99..79c8139 100644 (file)
@@ -33,6 +33,14 @@ int _getsockopt(struct bpf_sockopt *ctx)
        __u8 *optval = ctx->optval;
        struct sockopt_sk *storage;
 
+       /* Make sure bpf_get_netns_cookie is callable.
+        */
+       if (bpf_get_netns_cookie(NULL) == 0)
+               return 0;
+
+       if (bpf_get_netns_cookie(ctx) == 0)
+               return 0;
+
        if (ctx->level == SOL_IP && ctx->optname == IP_TOS) {
                /* Not interested in SOL_IP:IP_TOS;
                 * let next BPF program in the cgroup chain or kernel
@@ -123,6 +131,14 @@ int _setsockopt(struct bpf_sockopt *ctx)
        __u8 *optval = ctx->optval;
        struct sockopt_sk *storage;
 
+       /* Make sure bpf_get_netns_cookie is callable.
+        */
+       if (bpf_get_netns_cookie(NULL) == 0)
+               return 0;
+
+       if (bpf_get_netns_cookie(ctx) == 0)
+               return 0;
+
        if (ctx->level == SOL_IP && ctx->optname == IP_TOS) {
                /* Not interested in SOL_IP:IP_TOS;
                 * let next BPF program in the cgroup chain or kernel
diff --git a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c
new file mode 100644 (file)
index 0000000..2d3a771
--- /dev/null
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+int my_tid;
+
+int kprobe_res;
+int kprobe_multi_res;
+int kretprobe_res;
+int uprobe_res;
+int uretprobe_res;
+int tp_res;
+int pe_res;
+
+static void update(void *ctx, int *res)
+{
+       if (my_tid != (u32)bpf_get_current_pid_tgid())
+               return;
+
+       *res |= bpf_get_attach_cookie(ctx);
+}
+
+SEC("kprobe/sys_nanosleep")
+int handle_kprobe(struct pt_regs *ctx)
+{
+       update(ctx, &kprobe_res);
+       return 0;
+}
+
+SEC("kretprobe/sys_nanosleep")
+int handle_kretprobe(struct pt_regs *ctx)
+{
+       update(ctx, &kretprobe_res);
+       return 0;
+}
+
+SEC("uprobe/trigger_func")
+int handle_uprobe(struct pt_regs *ctx)
+{
+       update(ctx, &uprobe_res);
+       return 0;
+}
+
+SEC("uretprobe/trigger_func")
+int handle_uretprobe(struct pt_regs *ctx)
+{
+       update(ctx, &uretprobe_res);
+       return 0;
+}
+
+/* bpf_prog_array, used by the kernel internally to keep track of BPF
+ * programs attached to a given BPF hook (e.g., for tracepoints), doesn't
+ * allow the same BPF program to be attached multiple times. So have three
+ * identical copies ready to attach to the same tracepoint.
+ */
+SEC("tp/syscalls/sys_enter_nanosleep")
+int handle_tp1(struct pt_regs *ctx)
+{
+       update(ctx, &tp_res);
+       return 0;
+}
+SEC("tp/syscalls/sys_enter_nanosleep")
+int handle_tp2(struct pt_regs *ctx)
+{
+       update(ctx, &tp_res);
+       return 0;
+}
+SEC("tp/syscalls/sys_enter_nanosleep")
+int handle_tp3(void *ctx)
+{
+       update(ctx, &tp_res);
+       return 1;
+}
+
+SEC("perf_event")
+int handle_pe(struct pt_regs *ctx)
+{
+       update(ctx, &pe_res);
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL";
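The cookie values OR-ed into the *_res variables above are supplied at attach
time through the new opts-based libbpf attach APIs from this series; a sketch
for the kprobe case (skeleton name and cookie value illustrative):

	DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts, .bpf_cookie = 0x1);
	struct bpf_link *link;

	link = bpf_program__attach_kprobe_opts(skel->progs.handle_kprobe,
					       "sys_nanosleep", &opts);
	/* on success, handle_kprobe reads back 0x1 via bpf_get_attach_cookie() */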
index 44f5aa2..9a7829c 100644 (file)
@@ -125,6 +125,16 @@ int handle_downsize(void *ctx)
        return 0;
 }
 
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define bpf_core_read_int bpf_core_read
+#else
+#define bpf_core_read_int(dst, sz, src) ({ \
+       /* Prevent "subtraction from stack pointer prohibited" */ \
+       volatile long __off = sizeof(*dst) - (sz); \
+       bpf_core_read((char *)(dst) + __off, sz, src); \
+})
+#endif
+
 SEC("raw_tp/sys_enter")
 int handle_probed(void *ctx)
 {
@@ -132,23 +142,23 @@ int handle_probed(void *ctx)
        __u64 tmp;
 
        tmp = 0;
-       bpf_core_read(&tmp, bpf_core_field_size(in->ptr), &in->ptr);
+       bpf_core_read_int(&tmp, bpf_core_field_size(in->ptr), &in->ptr);
        ptr_probed = tmp;
 
        tmp = 0;
-       bpf_core_read(&tmp, bpf_core_field_size(in->val1), &in->val1);
+       bpf_core_read_int(&tmp, bpf_core_field_size(in->val1), &in->val1);
        val1_probed = tmp;
 
        tmp = 0;
-       bpf_core_read(&tmp, bpf_core_field_size(in->val2), &in->val2);
+       bpf_core_read_int(&tmp, bpf_core_field_size(in->val2), &in->val2);
        val2_probed = tmp;
 
        tmp = 0;
-       bpf_core_read(&tmp, bpf_core_field_size(in->val3), &in->val3);
+       bpf_core_read_int(&tmp, bpf_core_field_size(in->val3), &in->val3);
        val3_probed = tmp;
 
        tmp = 0;
-       bpf_core_read(&tmp, bpf_core_field_size(in->val4), &in->val4);
+       bpf_core_read_int(&tmp, bpf_core_field_size(in->val4), &in->val4);
        val4_probed = tmp;
 
        return 0;
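A note on bpf_core_read_int() above: bpf_core_read() copies field-size bytes
to the start of the destination, which preserves integer values only on
little-endian targets. On big-endian, the short read has to land at the tail
of the wider destination, which is what the __off adjustment does. A plain-C
illustration with hypothetical values:

	#include <stdint.h>
	#include <string.h>

	static uint64_t read_narrow_be(uint32_t src)
	{
		uint64_t dst = 0;

		/* Land the 4 source bytes in the last 4 bytes of dst, i.e. at
		 * offset sizeof(dst) - sizeof(src) == 4, mirroring __off above.
		 * On big-endian this gives dst == src; copying to offset 0
		 * would turn src == 0x11223344 into 0x1122334400000000.
		 */
		memcpy((char *)&dst + (sizeof(dst) - sizeof(src)), &src,
		       sizeof(src));
		return dst;
	}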
diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_weak.c b/tools/testing/selftests/bpf/progs/test_ksyms_weak.c
new file mode 100644 (file)
index 0000000..5f8379a
--- /dev/null
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test weak ksyms.
+ *
+ * Copyright (c) 2021 Google
+ */
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+
+int out__existing_typed = -1;
+__u64 out__existing_typeless = -1;
+
+__u64 out__non_existent_typeless = -1;
+__u64 out__non_existent_typed = -1;
+
+/* existing weak symbols */
+
+/* Test that existing weak symbols can be resolved. */
+extern const struct rq runqueues __ksym __weak; /* typed */
+extern const void bpf_prog_active __ksym __weak; /* typeless */
+
+/* non-existent weak symbols. */
+
+/* typeless symbol; its address defaults to zero. */
+extern const void bpf_link_fops1 __ksym __weak;
+
+/* typed symbol; its address defaults to zero. */
+extern const int bpf_link_fops2 __ksym __weak;
+
+SEC("raw_tp/sys_enter")
+int pass_handler(const void *ctx)
+{
+       struct rq *rq;
+
+       /* tests existing symbols. */
+       rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, 0);
+       if (rq)
+               out__existing_typed = rq->cpu;
+       out__existing_typeless = (__u64)&bpf_prog_active;
+
+       /* tests non-existent symbols. */
+       out__non_existent_typeless = (__u64)&bpf_link_fops1;
+
+       /* tests non-existent symbols. */
+       out__non_existent_typed = (__u64)&bpf_link_fops2;
+
+       if (&bpf_link_fops2) /* never taken: an unresolved weak ksym has address zero */
+               out__non_existent_typed = (__u64)bpf_per_cpu_ptr(&bpf_link_fops2, 0);
+
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL";
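Since an unresolved weak ksym resolves to address zero, the address check
above doubles as a kernel feature probe and lets the verifier prune the dead
branch. A hedged sketch of that pattern (symbol name hypothetical):

	extern const int maybe_new_kernel_sym __ksym __weak;

	SEC("raw_tp/sys_enter")
	int feature_probe(void *ctx)
	{
		/* address is zero when the running kernel lacks the symbol */
		if (&maybe_new_kernel_sym)
			bpf_printk("symbol present");
		return 0;
	}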
diff --git a/tools/testing/selftests/bpf/progs/test_perf_link.c b/tools/testing/selftests/bpf/progs/test_perf_link.c
new file mode 100644 (file)
index 0000000..c1db9fd
--- /dev/null
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+int run_cnt = 0;
+
+SEC("perf_event")
+int handler(struct pt_regs *ctx)
+{
+       __sync_fetch_and_add(&run_cnt, 1);
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL";
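Userspace opens a perf event and attaches the skeleton's program to it with
bpf_program__attach_perf_event(); one plausible setup (event choice and
skeleton names assumed, not taken from this patch):

	struct perf_event_attr attr = {
		.size = sizeof(attr),
		.type = PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_CPU_CLOCK,
		.freq = 1,
		.sample_freq = 1000, /* fire roughly 1000 times a second */
	};
	int pfd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu */,
			  -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
	struct bpf_link *link = bpf_program__attach_perf_event(skel->progs.handler, pfd);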
index e2ad261..8fda075 100644 (file)
@@ -59,9 +59,9 @@ int handler(const void *ctx)
        /* Kernel pointers */
        addr_ret = BPF_SNPRINTF(addr_out, sizeof(addr_out), "%pK %px %p",
                                0, 0xFFFF00000ADD4E55, 0xFFFF00000ADD4E55);
-       /* Strings embedding */
-       str_ret  = BPF_SNPRINTF(str_out, sizeof(str_out), "%s %+05s",
-                               str1, longstr);
+       /* Strings and single-byte character embedding */
+       str_ret  = BPF_SNPRINTF(str_out, sizeof(str_out), "%s % 9c %+2c %-3c %04c %0c %+05s",
+                               str1, 'a', 'b', 'c', 'd', 'e', longstr);
        /* Overflow */
        over_ret = BPF_SNPRINTF(over_out, sizeof(over_out), "%%overflow");
        /* Padding of fixed width numbers */
diff --git a/tools/testing/selftests/bpf/progs/test_task_pt_regs.c b/tools/testing/selftests/bpf/progs/test_task_pt_regs.c
new file mode 100644 (file)
index 0000000..6c059f1
--- /dev/null
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+struct pt_regs current_regs = {};
+struct pt_regs ctx_regs = {};
+int uprobe_res = 0;
+
+SEC("uprobe/trigger_func")
+int handle_uprobe(struct pt_regs *ctx)
+{
+       struct task_struct *current;
+       struct pt_regs *regs;
+
+       current = bpf_get_current_task_btf();
+       regs = (struct pt_regs *) bpf_task_pt_regs(current);
+       __builtin_memcpy(&current_regs, regs, sizeof(*regs));
+       __builtin_memcpy(&ctx_regs, ctx, sizeof(*ctx));
+
+       /* Prove that uprobe was run */
+       uprobe_res = 1;
+
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL";
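The matching runner would trigger the uprobed function, then compare the two
register snapshots; a sketch in the test_progs assertion style (the actual
test code may differ):

	/* after calling the uprobed trigger function: */
	ASSERT_EQ(skel->bss->uprobe_res, 1, "uprobe_ran");
	ASSERT_EQ(memcmp(&skel->bss->current_regs, &skel->bss->ctx_regs,
			 sizeof(struct pt_regs)), 0, "regs_match");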
index 6669077..718f596 100755 (executable)
@@ -2,4 +2,10 @@
 # SPDX-License-Identifier: GPL-2.0
 # Copyright (c) 2020 SUSE LLC.
 
+# 'make -C tools/testing/selftests/bpf install' will install to SCRIPT_DIR
+SCRIPT_DIR=$(dirname $(realpath $0))
+
+# 'make -C tools/testing/selftests/bpf' will install to BPFTOOL_INSTALL_PATH
+BPFTOOL_INSTALL_PATH="$SCRIPT_DIR"/tools/sbin
+export PATH=$SCRIPT_DIR:$BPFTOOL_INSTALL_PATH:$PATH
 python3 -m unittest -v test_bpftool.TestBpftool
index ac349a5..b03a875 100755 (executable)
@@ -22,7 +22,7 @@ KDIR_ROOT_DIR=$(realpath $PWD/$SCRIPT_REL_DIR/../../../../)
 cd $KDIR_ROOT_DIR
 if [ ! -e tools/bpf/bpftool/Makefile ]; then
        echo -e "skip:    bpftool files not found!\n"
-       exit 0
+       exit 4 # KSFT_SKIP=4
 fi
 
 ERROR=0
index ed12111..679cf96 100755 (executable)
@@ -4,11 +4,17 @@ set -e
 
 # Assume script is located under tools/testing/selftests/bpf/. We want to start
 # build attempts from the top of kernel repository.
-SCRIPT_REL_PATH=$(realpath --relative-to=$PWD $0)
+SCRIPT_REL_PATH=$(realpath $0)
 SCRIPT_REL_DIR=$(dirname $SCRIPT_REL_PATH)
-KDIR_ROOT_DIR=$(realpath $PWD/$SCRIPT_REL_DIR/../../../../)
+KDIR_ROOT_DIR=$(realpath $SCRIPT_REL_DIR/../../../../)
+SCRIPT_REL_DIR=$(dirname $(realpath --relative-to=$KDIR_ROOT_DIR $SCRIPT_REL_PATH))
 cd $KDIR_ROOT_DIR
 
+if [ ! -e $PWD/$SCRIPT_REL_DIR/Makefile ]; then
+       echo -e "skip:    bpftool files not found!\n"
+       exit 4 # KSFT_SKIP=4
+fi
+
 for tgt in docs docs-clean; do
        make -s -C $PWD/$SCRIPT_REL_DIR $tgt;
 done
index 14cea86..c7a36a9 100644 (file)
@@ -985,7 +985,7 @@ static void test_sockmap(unsigned int tasks, void *data)
 
                FD_ZERO(&w);
                FD_SET(sfd[3], &w);
-               to.tv_sec = 1;
+               to.tv_sec = 30;
                to.tv_usec = 0;
                s = select(sfd[3] + 1, &w, NULL, NULL, &to);
                if (s == -1) {
@@ -1396,15 +1396,22 @@ static void test_map_stress(void)
 #define DO_DELETE 0
 
 #define MAP_RETRIES 20
+#define MAX_DELAY_US 50000
+#define MIN_DELAY_RANGE_US 5000
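+/* Randomized exponential backoff for map update/delete retries: start
+ * below MIN_DELAY_RANGE_US and double the sleep each attempt while it is
+ * at most MAX_DELAY_US / 2, so it never exceeds MAX_DELAY_US.
+ */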
 
 static int map_update_retriable(int map_fd, const void *key, const void *value,
                                int flags, int attempts)
 {
+       int delay = rand() % MIN_DELAY_RANGE_US;
+
        while (bpf_map_update_elem(map_fd, key, value, flags)) {
                if (!attempts || (errno != EAGAIN && errno != EBUSY))
                        return -errno;
 
-               usleep(1);
+               if (delay <= MAX_DELAY_US / 2)
+                       delay *= 2;
+
+               usleep(delay);
                attempts--;
        }
 
@@ -1413,11 +1420,16 @@ static int map_update_retriable(int map_fd, const void *key, const void *value,
 
 static int map_delete_retriable(int map_fd, const void *key, int attempts)
 {
+       int delay = rand() % MIN_DELAY_RANGE_US;
+
        while (bpf_map_delete_elem(map_fd, key)) {
                if (!attempts || (errno != EAGAIN && errno != EBUSY))
                        return -errno;
 
-               usleep(1);
+               if (delay <= MAX_DELAY_US / 2)
+                       delay *= 2;
+
+               usleep(delay);
                attempts--;
        }
 
index 6f10310..cc1cd24 100644 (file)
 #include <execinfo.h> /* backtrace */
 #include <linux/membarrier.h>
 
+/* Adapted from perf/util/string.c */
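+/* For example: glob_match("sockmap_listen", "*map*") and
+ * glob_match("send_signal", "send_sig*") are true, while
+ * glob_match("xdp_bonding", "tc*") is false.
+ */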
+static bool glob_match(const char *str, const char *pat)
+{
+       while (*str && *pat && *pat != '*') {
+               if (*str != *pat)
+                       return false;
+               str++;
+               pat++;
+       }
+       /* Check wildcard */
+       if (*pat == '*') {
+               while (*pat == '*')
+                       pat++;
+               if (!*pat) /* Tail wildcard matches all */
+                       return true;
+               while (*str)
+                       if (glob_match(str++, pat))
+                               return true;
+       }
+       return !*str && !*pat;
+}
+
 #define EXIT_NO_TEST           2
 #define EXIT_ERR_SETUP_INFRA   3
 
@@ -55,12 +77,12 @@ static bool should_run(struct test_selector *sel, int num, const char *name)
        int i;
 
        for (i = 0; i < sel->blacklist.cnt; i++) {
-               if (strstr(name, sel->blacklist.strs[i]))
+               if (glob_match(name, sel->blacklist.strs[i]))
                        return false;
        }
 
        for (i = 0; i < sel->whitelist.cnt; i++) {
-               if (strstr(name, sel->whitelist.strs[i]))
+               if (glob_match(name, sel->whitelist.strs[i]))
                        return true;
        }
 
@@ -148,18 +170,18 @@ void test__end_subtest()
        struct prog_test_def *test = env.test;
        int sub_error_cnt = test->error_cnt - test->old_error_cnt;
 
+       dump_test_log(test, sub_error_cnt);
+
+       fprintf(env.stdout, "#%d/%d %s/%s:%s\n",
+              test->test_num, test->subtest_num, test->test_name, test->subtest_name,
+              sub_error_cnt ? "FAIL" : (test->skip_cnt ? "SKIP" : "OK"));
+
        if (sub_error_cnt)
                env.fail_cnt++;
        else if (test->skip_cnt == 0)
                env.sub_succ_cnt++;
        skip_account();
 
-       dump_test_log(test, sub_error_cnt);
-
-       fprintf(env.stdout, "#%d/%d %s:%s\n",
-              test->test_num, test->subtest_num, test->subtest_name,
-              sub_error_cnt ? "FAIL" : (test->skip_cnt ? "SKIP" : "OK"));
-
        free(test->subtest_name);
        test->subtest_name = NULL;
 }
@@ -450,6 +472,8 @@ enum ARG_KEYS {
        ARG_VERBOSE = 'v',
        ARG_GET_TEST_CNT = 'c',
        ARG_LIST_TEST_NAMES = 'l',
+       ARG_TEST_NAME_GLOB_ALLOWLIST = 'a',
+       ARG_TEST_NAME_GLOB_DENYLIST = 'd',
 };
 
 static const struct argp_option opts[] = {
@@ -467,6 +491,10 @@ static const struct argp_option opts[] = {
          "Get number of selected top-level tests " },
        { "list", ARG_LIST_TEST_NAMES, NULL, 0,
          "List test names that would run (without running them) " },
+       { "allow", ARG_TEST_NAME_GLOB_ALLOWLIST, "NAMES", 0,
+         "Run tests with name matching the pattern (supports '*' wildcard)." },
+       { "deny", ARG_TEST_NAME_GLOB_DENYLIST, "NAMES", 0,
+         "Don't run tests with name matching the pattern (supports '*' wildcard)." },
        {},
 };
 
@@ -491,36 +519,48 @@ static void free_str_set(const struct str_set *set)
        free(set->strs);
 }
 
-static int parse_str_list(const char *s, struct str_set *set)
+static int parse_str_list(const char *s, struct str_set *set, bool is_glob_pattern)
 {
        char *input, *state = NULL, *next, **tmp, **strs = NULL;
-       int cnt = 0;
+       int i, cnt = 0;
 
        input = strdup(s);
        if (!input)
                return -ENOMEM;
 
-       set->cnt = 0;
-       set->strs = NULL;
-
        while ((next = strtok_r(state ? NULL : input, ",", &state))) {
                tmp = realloc(strs, sizeof(*strs) * (cnt + 1));
                if (!tmp)
                        goto err;
                strs = tmp;
 
-               strs[cnt] = strdup(next);
-               if (!strs[cnt])
-                       goto err;
+               if (is_glob_pattern) {
+                       strs[cnt] = strdup(next);
+                       if (!strs[cnt])
+                               goto err;
+               } else {
+                       strs[cnt] = malloc(strlen(next) + 2 + 1);
+                       if (!strs[cnt])
+                               goto err;
+                       sprintf(strs[cnt], "*%s*", next);
+               }
 
                cnt++;
        }
 
-       set->cnt = cnt;
-       set->strs = (const char **)strs;
+       tmp = realloc(set->strs, sizeof(*strs) * (cnt + set->cnt));
+       if (!tmp)
+               goto err;
+       memcpy(tmp + set->cnt, strs, sizeof(*strs) * cnt);
+       set->strs = (const char **)tmp;
+       set->cnt += cnt;
+
        free(input);
+       free(strs);
        return 0;
 err:
+       for (i = 0; i < cnt; i++)
+               free(strs[i]);
        free(strs);
        free(input);
        return -ENOMEM;
@@ -553,29 +593,35 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
                }
                break;
        }
+       case ARG_TEST_NAME_GLOB_ALLOWLIST:
        case ARG_TEST_NAME: {
                char *subtest_str = strchr(arg, '/');
 
                if (subtest_str) {
                        *subtest_str = '\0';
                        if (parse_str_list(subtest_str + 1,
-                                          &env->subtest_selector.whitelist))
+                                          &env->subtest_selector.whitelist,
+                                          key == ARG_TEST_NAME_GLOB_ALLOWLIST))
                                return -ENOMEM;
                }
-               if (parse_str_list(arg, &env->test_selector.whitelist))
+               if (parse_str_list(arg, &env->test_selector.whitelist,
+                                  key == ARG_TEST_NAME_GLOB_ALLOWLIST))
                        return -ENOMEM;
                break;
        }
+       case ARG_TEST_NAME_GLOB_DENYLIST:
        case ARG_TEST_NAME_BLACKLIST: {
                char *subtest_str = strchr(arg, '/');
 
                if (subtest_str) {
                        *subtest_str = '\0';
                        if (parse_str_list(subtest_str + 1,
-                                          &env->subtest_selector.blacklist))
+                                          &env->subtest_selector.blacklist,
+                                          key == ARG_TEST_NAME_GLOB_DENYLIST))
                                return -ENOMEM;
                }
-               if (parse_str_list(arg, &env->test_selector.blacklist))
+               if (parse_str_list(arg, &env->test_selector.blacklist,
+                                  key == ARG_TEST_NAME_GLOB_DENYLIST))
                        return -ENOMEM;
                break;
        }
@@ -755,7 +801,7 @@ int main(int argc, char **argv)
        save_netns();
        stdio_hijack();
        env.has_testmod = true;
-       if (load_bpf_testmod()) {
+       if (!env.list_test_names && load_bpf_testmod()) {
                fprintf(env.stderr, "WARNING! Selftests relying on bpf_testmod.ko will be skipped.\n");
                env.has_testmod = false;
        }
@@ -786,24 +832,25 @@ int main(int argc, char **argv)
                        test__end_subtest();
 
                test->tested = true;
-               if (test->error_cnt)
-                       env.fail_cnt++;
-               else
-                       env.succ_cnt++;
-               skip_account();
 
                dump_test_log(test, test->error_cnt);
 
                fprintf(env.stdout, "#%d %s:%s\n",
                        test->test_num, test->test_name,
-                       test->error_cnt ? "FAIL" : "OK");
+                       test->error_cnt ? "FAIL" : (test->skip_cnt ? "SKIP" : "OK"));
+
+               if (test->error_cnt)
+                       env.fail_cnt++;
+               else
+                       env.succ_cnt++;
+               skip_account();
 
                reset_affinity();
                restore_netns();
                if (test->need_cgroup_cleanup)
                        cleanup_cgroup_environment();
        }
-       if (env.has_testmod)
+       if (!env.list_test_names && env.has_testmod)
                unload_bpf_testmod();
        stdio_restore();
 
index 46633a3..cd7bf32 100755 (executable)
 # ----------------
 # Must run with CAP_NET_ADMIN capability.
 #
-# Run (full color-coded output):
-#   sudo ./test_xsk.sh -c
+# Run:
+#   sudo ./test_xsk.sh
 #
 # If running from kselftests:
-#   sudo make colorconsole=1 run_tests
-#
-# Run (full output without color-coding):
-#   sudo ./test_xsk.sh
+#   sudo make run_tests
 #
 # Run with verbose output:
 #   sudo ./test_xsk.sh -v
@@ -83,7 +80,6 @@
 while getopts "cvD" flag
 do
        case "${flag}" in
-               c) colorconsole=1;;
                v) verbose=1;;
                D) dump_pkts=1;;
        esac
index 1bbd1d9..e7a19b0 100644 (file)
@@ -136,3 +136,90 @@ void read_trace_pipe(void)
                }
        }
 }
+
+#if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2
+
+#define OP_RT_RA_MASK   0xffff0000UL
+#define LIS_R2          0x3c400000UL
+#define ADDIS_R2_R12    0x3c4c0000UL
+#define ADDI_R2_R2      0x38420000UL
+
+ssize_t get_uprobe_offset(const void *addr, ssize_t base)
+{
+       u32 *insn = (u32 *)(uintptr_t)addr;
+
+       /*
+        * A PPC64 ABIv2 function may have a local and a global entry
+        * point. We need to use the local entry point when patching
+        * functions, so identify and step over the global entry point
+        * sequence.
+        *
+        * The global entry point sequence is always of the form:
+        *
+        * addis r2,r12,XXXX
+        * addi  r2,r2,XXXX
+        *
+        * A linker optimisation may convert the addis to lis:
+        *
+        * lis   r2,XXXX
+        * addi  r2,r2,XXXX
+        */
+       if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
+            ((*insn & OP_RT_RA_MASK) == LIS_R2)) &&
+           ((*(insn + 1) & OP_RT_RA_MASK) == ADDI_R2_R2))
+               return (ssize_t)(insn + 2) - base;
+       else
+               return (uintptr_t)addr - base;
+}
+
+#else
+
+ssize_t get_uprobe_offset(const void *addr, ssize_t base)
+{
+       return (uintptr_t)addr - base;
+}
+
+#endif
+
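+/* Base load address of our own binary: the start of the first r-xp
+ * mapping in /proc/self/maps minus its file offset, assuming the text
+ * segment is mapped straight from the ELF file.
+ */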
+ssize_t get_base_addr(void)
+{
+       size_t start, offset;
+       char buf[256];
+       FILE *f;
+
+       f = fopen("/proc/self/maps", "r");
+       if (!f)
+               return -errno;
+
+       while (fscanf(f, "%zx-%*x %s %zx %*[^\n]\n",
+                     &start, buf, &offset) == 3) {
+               if (strcmp(buf, "r-xp") == 0) {
+                       fclose(f);
+                       return start - offset;
+               }
+       }
+
+       fclose(f);
+       return -EINVAL;
+}
+
+ssize_t get_rel_offset(uintptr_t addr)
+{
+       size_t start, end, offset;
+       char buf[256];
+       FILE *f;
+
+       f = fopen("/proc/self/maps", "r");
+       if (!f)
+               return -errno;
+
+       while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &offset) == 4) {
+               if (addr >= start && addr < end) {
+                       fclose(f);
+                       return (size_t)addr - start + offset;
+               }
+       }
+
+       fclose(f);
+       return -EINVAL;
+}
index f62fdef..d907b44 100644 (file)
@@ -18,4 +18,8 @@ int kallsyms_find(const char *sym, unsigned long long *addr);
 
 void read_trace_pipe(void);
 
+ssize_t get_uprobe_offset(const void *addr, ssize_t base);
+ssize_t get_base_addr(void);
+ssize_t get_rel_offset(uintptr_t addr);
+
 #endif
index 1135fb9..f53ce26 100644 (file)
@@ -70,7 +70,6 @@
 #include <errno.h>
 #include <getopt.h>
 #include <asm/barrier.h>
-typedef __u16 __sum16;
 #include <linux/if_link.h>
 #include <linux/if_ether.h>
 #include <linux/ip.h>
@@ -106,14 +105,9 @@ static const u16 UDP_PORT2 = 2121;
 
 static void __exit_with_error(int error, const char *file, const char *func, int line)
 {
-       if (configured_mode == TEST_MODE_UNCONFIGURED) {
-               ksft_exit_fail_msg
-               ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error));
-       } else {
-               ksft_test_result_fail
-               ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error));
-               ksft_exit_xfail();
-       }
+       ksft_test_result_fail("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error,
+                             strerror(error));
+       ksft_exit_xfail();
 }
 
 #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__)
@@ -126,7 +120,7 @@ static void __exit_with_error(int error, const char *file, const char *func, int
                               test_type == TEST_TYPE_STATS ? "Stats" : "",\
                               test_type == TEST_TYPE_BPF_RES ? "BPF RES" : ""))
 
-static void *memset32_htonl(void *dest, u32 val, u32 size)
+static void memset32_htonl(void *dest, u32 val, u32 size)
 {
        u32 *ptr = (u32 *)dest;
        int i;
@@ -135,11 +129,6 @@ static void *memset32_htonl(void *dest, u32 val, u32 size)
 
        for (i = 0; i < (size & (~0x3)); i += 4)
                ptr[i >> 2] = val;
-
-       for (; i < size; i++)
-               ((char *)dest)[i] = ((char *)&val)[i & 3];
-
-       return dest;
 }
 
 /*
@@ -230,13 +219,13 @@ static void gen_ip_hdr(struct ifobject *ifobject, struct iphdr *ip_hdr)
        ip_hdr->check = 0;
 }
 
-static void gen_udp_hdr(struct generic_data *data, struct ifobject *ifobject,
+static void gen_udp_hdr(u32 payload, void *pkt, struct ifobject *ifobject,
                        struct udphdr *udp_hdr)
 {
        udp_hdr->source = htons(ifobject->src_port);
        udp_hdr->dest = htons(ifobject->dst_port);
        udp_hdr->len = htons(UDP_PKT_SIZE);
-       memset32_htonl(pkt_data + PKT_HDR_SIZE, htonl(data->seqnum), UDP_PKT_DATA_SIZE);
+       memset32_htonl(pkt + PKT_HDR_SIZE, payload, UDP_PKT_DATA_SIZE);
 }
 
 static void gen_udp_csum(struct udphdr *udp_hdr, struct iphdr *ip_hdr)
@@ -246,12 +235,7 @@ static void gen_udp_csum(struct udphdr *udp_hdr, struct iphdr *ip_hdr)
            udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE, IPPROTO_UDP, (u16 *)udp_hdr);
 }
 
-static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr)
-{
-       memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, PKT_SIZE);
-}
-
-static void xsk_configure_umem(struct ifobject *data, void *buffer, int idx)
+static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size, int idx)
 {
        struct xsk_umem_config cfg = {
                .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
@@ -260,7 +244,6 @@ static void xsk_configure_umem(struct ifobject *data, void *buffer, int idx)
                .frame_headroom = frame_headroom,
                .flags = XSK_UMEM__DEFAULT_FLAGS
        };
-       int size = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE;
        struct xsk_umem_info *umem;
        int ret;
 
@@ -271,7 +254,7 @@ static void xsk_configure_umem(struct ifobject *data, void *buffer, int idx)
        ret = xsk_umem__create(&umem->umem, buffer, size,
                               &umem->fq, &umem->cq, &cfg);
        if (ret)
-               exit_with_error(ret);
+               exit_with_error(-ret);
 
        umem->buffer = buffer;
 
@@ -285,7 +268,7 @@ static void xsk_populate_fill_ring(struct xsk_umem_info *umem)
 
        ret = xsk_ring_prod__reserve(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS, &idx);
        if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS)
-               exit_with_error(ret);
+               exit_with_error(-ret);
        for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++)
                *xsk_ring_prod__fill_addr(&umem->fq, idx++) = i * XSK_UMEM__DEFAULT_FRAME_SIZE;
        xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS);
@@ -333,20 +316,19 @@ static struct option long_options[] = {
        {"queue", optional_argument, 0, 'q'},
        {"dump-pkts", optional_argument, 0, 'D'},
        {"verbose", no_argument, 0, 'v'},
-       {"tx-pkt-count", optional_argument, 0, 'C'},
        {0, 0, 0, 0}
 };
 
 static void usage(const char *prog)
 {
        const char *str =
-           "  Usage: %s [OPTIONS]\n"
-           "  Options:\n"
-           "  -i, --interface      Use interface\n"
-           "  -q, --queue=n        Use queue n (default 0)\n"
-           "  -D, --dump-pkts      Dump packets L2 - L5\n"
-           "  -v, --verbose        Verbose output\n"
-           "  -C, --tx-pkt-count=n Number of packets to send\n";
+               "  Usage: %s [OPTIONS]\n"
+               "  Options:\n"
+               "  -i, --interface      Use interface\n"
+               "  -q, --queue=n        Use queue n (default 0)\n"
+               "  -D, --dump-pkts      Dump packets L2 - L5\n"
+               "  -v, --verbose        Verbose output\n";
+
        ksft_print_msg(str, prog);
 }
 
@@ -392,7 +374,7 @@ static void parse_command_line(int argc, char **argv)
        opterr = 0;
 
        for (;;) {
-               c = getopt_long(argc, argv, "i:DC:v", long_options, &option_index);
+               c = getopt_long(argc, argv, "i:Dv", long_options, &option_index);
 
                if (c == -1)
                        break;
@@ -413,13 +395,10 @@ static void parse_command_line(int argc, char **argv)
                        interface_index++;
                        break;
                case 'D':
-                       debug_pkt_dump = 1;
-                       break;
-               case 'C':
-                       opt_pkt_count = atoi(optarg);
+                       opt_pkt_dump = true;
                        break;
                case 'v':
-                       opt_verbose = 1;
+                       opt_verbose = true;
                        break;
                default:
                        usage(basename(argv[0]));
@@ -427,17 +406,143 @@ static void parse_command_line(int argc, char **argv)
                }
        }
 
-       if (!opt_pkt_count) {
-               print_verbose("No tx-pkt-count specified, using default %u\n", DEFAULT_PKT_CNT);
-               opt_pkt_count = DEFAULT_PKT_CNT;
-       }
-
        if (!validate_interfaces()) {
                usage(basename(argv[0]));
                ksft_exit_xfail();
        }
 }
 
+static struct pkt *pkt_stream_get_pkt(struct pkt_stream *pkt_stream, u32 pkt_nb)
+{
+       if (pkt_nb >= pkt_stream->nb_pkts)
+               return NULL;
+
+       return &pkt_stream->pkts[pkt_nb];
+}
+
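+/* Build the reference packet specification for a run: packet i lives in
+ * umem frame (i % num_frames) and carries i as its payload, so Tx
+ * generation and Rx validation are driven from the same stream.
+ */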
+static struct pkt_stream *pkt_stream_generate(u32 nb_pkts, u32 pkt_len)
+{
+       struct pkt_stream *pkt_stream;
+       u32 i;
+
+       pkt_stream = malloc(sizeof(*pkt_stream));
+       if (!pkt_stream)
+               exit_with_error(ENOMEM);
+
+       pkt_stream->pkts = calloc(nb_pkts, sizeof(*pkt_stream->pkts));
+       if (!pkt_stream->pkts)
+               exit_with_error(ENOMEM);
+
+       pkt_stream->nb_pkts = nb_pkts;
+       for (i = 0; i < nb_pkts; i++) {
+               pkt_stream->pkts[i].addr = (i % num_frames) * XSK_UMEM__DEFAULT_FRAME_SIZE;
+               pkt_stream->pkts[i].len = pkt_len;
+               pkt_stream->pkts[i].payload = i;
+       }
+
+       return pkt_stream;
+}
+
+static struct pkt *pkt_generate(struct ifobject *ifobject, u32 pkt_nb)
+{
+       struct pkt *pkt = pkt_stream_get_pkt(ifobject->pkt_stream, pkt_nb);
+       struct udphdr *udp_hdr;
+       struct ethhdr *eth_hdr;
+       struct iphdr *ip_hdr;
+       void *data;
+
+       if (!pkt)
+               return NULL;
+
+       data = xsk_umem__get_data(ifobject->umem->buffer, pkt->addr);
+       udp_hdr = (struct udphdr *)(data + sizeof(struct ethhdr) + sizeof(struct iphdr));
+       ip_hdr = (struct iphdr *)(data + sizeof(struct ethhdr));
+       eth_hdr = (struct ethhdr *)data;
+
+       gen_udp_hdr(pkt_nb, data, ifobject, udp_hdr);
+       gen_ip_hdr(ifobject, ip_hdr);
+       gen_udp_csum(udp_hdr, ip_hdr);
+       gen_eth_hdr(ifobject, eth_hdr);
+
+       return pkt;
+}
+
+static void pkt_dump(void *pkt, u32 len)
+{
+       char s[INET_ADDRSTRLEN];
+       struct ethhdr *ethhdr;
+       struct udphdr *udphdr;
+       struct iphdr *iphdr;
+       int payload, i;
+
+       ethhdr = pkt;
+       iphdr = pkt + sizeof(*ethhdr);
+       udphdr = pkt + sizeof(*ethhdr) + sizeof(*iphdr);
+
+       /* extract L2 frame */
+       fprintf(stdout, "DEBUG>> L2: dst mac: ");
+       for (i = 0; i < ETH_ALEN; i++)
+               fprintf(stdout, "%02X", ethhdr->h_dest[i]);
+
+       fprintf(stdout, "\nDEBUG>> L2: src mac: ");
+       for (i = 0; i < ETH_ALEN; i++)
+               fprintf(stdout, "%02X", ethhdr->h_source[i]);
+
+       /* extract L3 frame */
+       fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", iphdr->ihl);
+       fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n",
+               inet_ntop(AF_INET, &iphdr->saddr, s, sizeof(s)));
+       fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n",
+               inet_ntop(AF_INET, &iphdr->daddr, s, sizeof(s)));
+       /* extract L4 frame */
+       fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", ntohs(udphdr->source));
+       fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", ntohs(udphdr->dest));
+       /* extract L5 frame */
+       payload = *((uint32_t *)(pkt + PKT_HDR_SIZE));
+
+       fprintf(stdout, "DEBUG>> L5: payload: %d\n", payload);
+       fprintf(stdout, "---------------------------------------\n");
+}
+
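+/* Check one received descriptor against the expected packet from the
+ * stream: the frame must be our IPv4 test packet (version/TOS match),
+ * the descriptor length must equal the expected length, and the payload
+ * must carry the expected sequence number.
+ */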
+static bool is_pkt_valid(struct pkt *pkt, void *buffer, const struct xdp_desc *desc)
+{
+       void *data = xsk_umem__get_data(buffer, desc->addr);
+       struct iphdr *iphdr = (struct iphdr *)(data + sizeof(struct ethhdr));
+
+       if (!pkt) {
+               ksft_test_result_fail("ERROR: [%s] too many packets received\n", __func__);
+               return false;
+       }
+
+       if (iphdr->version == IP_PKT_VER && iphdr->tos == IP_PKT_TOS) {
+               u32 seqnum = ntohl(*((u32 *)(data + PKT_HDR_SIZE)));
+
+               if (opt_pkt_dump && test_type != TEST_TYPE_STATS)
+                       pkt_dump(data, PKT_SIZE);
+
+               if (pkt->len != desc->len) {
+                       ksft_test_result_fail
+                               ("ERROR: [%s] expected length [%d], got length [%d]\n",
+                                       __func__, pkt->len, desc->len);
+                       return false;
+               }
+
+               if (pkt->payload != seqnum) {
+                       ksft_test_result_fail
+                               ("ERROR: [%s] expected seqnum [%d], got seqnum [%d]\n",
+                                       __func__, pkt->payload, seqnum);
+                       return false;
+               }
+       } else {
+               ksft_print_msg("Invalid frame received: ");
+               ksft_print_msg("[IP_PKT_VER: %02X], [IP_PKT_TOS: %02X]\n", iphdr->version,
+                              iphdr->tos);
+               return false;
+       }
+
+       return true;
+}
+
 static void kick_tx(struct xsk_socket_info *xsk)
 {
        int ret;
@@ -448,7 +553,7 @@ static void kick_tx(struct xsk_socket_info *xsk)
        exit_with_error(errno);
 }
 
-static void complete_tx_only(struct xsk_socket_info *xsk, int batch_size)
+static void complete_pkts(struct xsk_socket_info *xsk, int batch_size)
 {
        unsigned int rcvd;
        u32 idx;
@@ -463,133 +568,108 @@ static void complete_tx_only(struct xsk_socket_info *xsk, int batch_size)
        if (rcvd) {
                xsk_ring_cons__release(&xsk->umem->cq, rcvd);
                xsk->outstanding_tx -= rcvd;
-               xsk->tx_npkts += rcvd;
        }
 }
 
-static void rx_pkt(struct xsk_socket_info *xsk, struct pollfd *fds)
+static void receive_pkts(struct pkt_stream *pkt_stream, struct xsk_socket_info *xsk,
+                        struct pollfd *fds)
 {
-       unsigned int rcvd, i;
-       u32 idx_rx = 0, idx_fq = 0;
+       u32 idx_rx = 0, idx_fq = 0, rcvd, i, pkt_count = 0;
+       struct pkt *pkt;
        int ret;
 
-       rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
-       if (!rcvd) {
-               if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
-                       ret = poll(fds, 1, POLL_TMOUT);
-                       if (ret < 0)
-                               exit_with_error(ret);
+       pkt = pkt_stream_get_pkt(pkt_stream, pkt_count++);
+       while (pkt) {
+               rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
+               if (!rcvd) {
+                       if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
+                               ret = poll(fds, 1, POLL_TMOUT);
+                               if (ret < 0)
+                                       exit_with_error(-ret);
+                       }
+                       continue;
                }
-               return;
-       }
 
-       ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
-       while (ret != rcvd) {
-               if (ret < 0)
-                       exit_with_error(ret);
-               if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
-                       ret = poll(fds, 1, POLL_TMOUT);
+               ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
+               while (ret != rcvd) {
                        if (ret < 0)
-                               exit_with_error(ret);
+                               exit_with_error(-ret);
+                       if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
+                               ret = poll(fds, 1, POLL_TMOUT);
+                               if (ret < 0)
+                                       exit_with_error(-ret);
+                       }
+                       ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
                }
-               ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
-       }
-
-       for (i = 0; i < rcvd; i++) {
-               u64 addr, orig;
-
-               addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
-               xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++);
-               orig = xsk_umem__extract_addr(addr);
 
-               addr = xsk_umem__add_offset_to_addr(addr);
-               pkt_node_rx = malloc(sizeof(struct pkt) + PKT_SIZE);
-               if (!pkt_node_rx)
-                       exit_with_error(errno);
+               for (i = 0; i < rcvd; i++) {
+                       const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++);
+                       u64 addr = desc->addr, orig;
 
-               pkt_node_rx->pkt_frame = malloc(PKT_SIZE);
-               if (!pkt_node_rx->pkt_frame)
-                       exit_with_error(errno);
+                       orig = xsk_umem__extract_addr(addr);
+                       addr = xsk_umem__add_offset_to_addr(addr);
+                       if (!is_pkt_valid(pkt, xsk->umem->buffer, desc))
+                               return;
 
-               memcpy(pkt_node_rx->pkt_frame, xsk_umem__get_data(xsk->umem->buffer, addr),
-                      PKT_SIZE);
-
-               TAILQ_INSERT_HEAD(&head, pkt_node_rx, pkt_nodes);
+                       *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig;
+                       pkt = pkt_stream_get_pkt(pkt_stream, pkt_count++);
+               }
 
-               *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig;
+               xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
+               xsk_ring_cons__release(&xsk->rx, rcvd);
        }
-
-       xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
-       xsk_ring_cons__release(&xsk->rx, rcvd);
-       xsk->rx_npkts += rcvd;
 }
 
-static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size)
+static u32 __send_pkts(struct ifobject *ifobject, u32 pkt_nb)
 {
-       u32 idx = 0;
-       unsigned int i;
-       bool tx_invalid_test = stat_test_type == STAT_TEST_TX_INVALID;
-       u32 len = tx_invalid_test ? XSK_UMEM__DEFAULT_FRAME_SIZE + 1 : PKT_SIZE;
+       struct xsk_socket_info *xsk = ifobject->xsk;
+       u32 i, idx;
 
-       while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < batch_size)
-               complete_tx_only(xsk, batch_size);
+       while (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) < BATCH_SIZE)
+               complete_pkts(xsk, BATCH_SIZE);
 
-       for (i = 0; i < batch_size; i++) {
+       for (i = 0; i < BATCH_SIZE; i++) {
                struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i);
+               struct pkt *pkt = pkt_generate(ifobject, pkt_nb);
 
-               tx_desc->addr = (*frameptr + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT;
-               tx_desc->len = len;
-       }
+               if (!pkt)
+                       break;
 
-       xsk_ring_prod__submit(&xsk->tx, batch_size);
-       if (!tx_invalid_test) {
-               xsk->outstanding_tx += batch_size;
-       } else if (xsk_ring_prod__needs_wakeup(&xsk->tx)) {
-               kick_tx(xsk);
+               tx_desc->addr = pkt->addr;
+               tx_desc->len = pkt->len;
+               pkt_nb++;
        }
-       *frameptr += batch_size;
-       *frameptr %= num_frames;
-       complete_tx_only(xsk, batch_size);
-}
-
-static int get_batch_size(int pkt_cnt)
-{
-       if (!opt_pkt_count)
-               return BATCH_SIZE;
 
-       if (pkt_cnt + BATCH_SIZE <= opt_pkt_count)
-               return BATCH_SIZE;
+       xsk_ring_prod__submit(&xsk->tx, i);
+       if (stat_test_type != STAT_TEST_TX_INVALID)
+               xsk->outstanding_tx += i;
+       else if (xsk_ring_prod__needs_wakeup(&xsk->tx))
+               kick_tx(xsk);
+       complete_pkts(xsk, i);
 
-       return opt_pkt_count - pkt_cnt;
+       return i;
 }
 
-static void complete_tx_only_all(struct ifobject *ifobject)
+static void wait_for_tx_completion(struct xsk_socket_info *xsk)
 {
-       bool pending;
-
-       do {
-               pending = false;
-               if (ifobject->xsk->outstanding_tx) {
-                       complete_tx_only(ifobject->xsk, BATCH_SIZE);
-                       pending = !!ifobject->xsk->outstanding_tx;
-               }
-       } while (pending);
+       while (xsk->outstanding_tx)
+               complete_pkts(xsk, BATCH_SIZE);
 }
 
-static void tx_only_all(struct ifobject *ifobject)
+static void send_pkts(struct ifobject *ifobject)
 {
        struct pollfd fds[MAX_SOCKS] = { };
-       u32 frame_nb = 0;
-       int pkt_cnt = 0;
-       int ret;
+       u32 pkt_cnt = 0;
 
        fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk);
        fds[0].events = POLLOUT;
 
-       while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) {
-               int batch_size = get_batch_size(pkt_cnt);
+       while (pkt_cnt < ifobject->pkt_stream->nb_pkts) {
+               u32 sent;
 
                if (test_type == TEST_TYPE_POLL) {
+                       int ret;
+
                        ret = poll(fds, 1, POLL_TMOUT);
                        if (ret <= 0)
                                continue;
@@ -598,78 +678,30 @@ static void tx_only_all(struct ifobject *ifobject)
                                continue;
                }
 
-               tx_only(ifobject->xsk, &frame_nb, batch_size);
-               pkt_cnt += batch_size;
+               sent = __send_pkts(ifobject, pkt_cnt);
+               pkt_cnt += sent;
+               usleep(10);
        }
 
-       if (opt_pkt_count)
-               complete_tx_only_all(ifobject);
+       wait_for_tx_completion(ifobject->xsk);
 }
 
-static void worker_pkt_dump(void)
-{
-       struct ethhdr *ethhdr;
-       struct iphdr *iphdr;
-       struct udphdr *udphdr;
-       char s[128];
-       int payload;
-       void *ptr;
-
-       fprintf(stdout, "---------------------------------------\n");
-       for (int iter = 0; iter < num_frames - 1; iter++) {
-               ptr = pkt_buf[iter]->payload;
-               ethhdr = ptr;
-               iphdr = ptr + sizeof(*ethhdr);
-               udphdr = ptr + sizeof(*ethhdr) + sizeof(*iphdr);
-
-               /*extract L2 frame */
-               fprintf(stdout, "DEBUG>> L2: dst mac: ");
-               for (int i = 0; i < ETH_ALEN; i++)
-                       fprintf(stdout, "%02X", ethhdr->h_dest[i]);
-
-               fprintf(stdout, "\nDEBUG>> L2: src mac: ");
-               for (int i = 0; i < ETH_ALEN; i++)
-                       fprintf(stdout, "%02X", ethhdr->h_source[i]);
-
-               /*extract L3 frame */
-               fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", iphdr->ihl);
-               fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n",
-                       inet_ntop(AF_INET, &iphdr->saddr, s, sizeof(s)));
-               fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n",
-                       inet_ntop(AF_INET, &iphdr->daddr, s, sizeof(s)));
-               /*extract L4 frame */
-               fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", ntohs(udphdr->source));
-               fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", ntohs(udphdr->dest));
-               /*extract L5 frame */
-               payload = *((uint32_t *)(ptr + PKT_HDR_SIZE));
-
-               if (payload == EOT) {
-                       print_verbose("End-of-transmission frame received\n");
-                       fprintf(stdout, "---------------------------------------\n");
-                       break;
-               }
-               fprintf(stdout, "DEBUG>> L5: payload: %d\n", payload);
-               fprintf(stdout, "---------------------------------------\n");
-       }
-}
-
-static void worker_stats_validate(struct ifobject *ifobject)
+static bool rx_stats_are_valid(struct ifobject *ifobject)
 {
+       u32 xsk_stat = 0, expected_stat = ifobject->pkt_stream->nb_pkts;
+       struct xsk_socket *xsk = ifobject->xsk->xsk;
+       int fd = xsk_socket__fd(xsk);
        struct xdp_statistics stats;
        socklen_t optlen;
        int err;
-       struct xsk_socket *xsk = stat_test_type == STAT_TEST_TX_INVALID ?
-                                                       ifdict[!ifobject->ifdict_index]->xsk->xsk :
-                                                       ifobject->xsk->xsk;
-       int fd = xsk_socket__fd(xsk);
-       unsigned long xsk_stat = 0, expected_stat = opt_pkt_count;
-
-       sigvar = 0;
 
        optlen = sizeof(stats);
        err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen);
-       if (err)
-               return;
+       if (err) {
+               ksft_test_result_fail("ERROR: [%s] getsockopt(XDP_STATISTICS) error %u %s\n",
+                                     __func__, -err, strerror(-err));
+               return true;
+       }
 
        if (optlen == sizeof(struct xdp_statistics)) {
                switch (stat_test_type) {
@@ -677,8 +709,7 @@ static void worker_stats_validate(struct ifobject *ifobject)
                        xsk_stat = stats.rx_dropped;
                        break;
                case STAT_TEST_TX_INVALID:
-                       xsk_stat = stats.tx_invalid_descs;
-                       break;
+                       return true;
                case STAT_TEST_RX_FULL:
                        xsk_stat = stats.rx_ring_full;
                        expected_stat -= RX_FULL_RXQSIZE;
@@ -691,99 +722,70 @@ static void worker_stats_validate(struct ifobject *ifobject)
                }
 
                if (xsk_stat == expected_stat)
-                       sigvar = 1;
+                       return true;
        }
+
+       return false;
 }
 
-static void worker_pkt_validate(void)
+static void tx_stats_validate(struct ifobject *ifobject)
 {
-       u32 payloadseqnum = -2;
-       struct iphdr *iphdr;
-
-       while (1) {
-               pkt_node_rx_q = TAILQ_LAST(&head, head_s);
-               if (!pkt_node_rx_q)
-                       break;
-
-               iphdr = (struct iphdr *)(pkt_node_rx_q->pkt_frame + sizeof(struct ethhdr));
-
-               /*do not increment pktcounter if !(tos=0x9 and ipv4) */
-               if (iphdr->version == IP_PKT_VER && iphdr->tos == IP_PKT_TOS) {
-                       payloadseqnum = *((uint32_t *)(pkt_node_rx_q->pkt_frame + PKT_HDR_SIZE));
-                       if (debug_pkt_dump && payloadseqnum != EOT) {
-                               pkt_obj = malloc(sizeof(*pkt_obj));
-                               pkt_obj->payload = malloc(PKT_SIZE);
-                               memcpy(pkt_obj->payload, pkt_node_rx_q->pkt_frame, PKT_SIZE);
-                               pkt_buf[payloadseqnum] = pkt_obj;
-                       }
-
-                       if (payloadseqnum == EOT) {
-                               print_verbose("End-of-transmission frame received: PASS\n");
-                               sigvar = 1;
-                               break;
-                       }
+       struct xsk_socket *xsk = ifobject->xsk->xsk;
+       int fd = xsk_socket__fd(xsk);
+       struct xdp_statistics stats;
+       socklen_t optlen;
+       int err;
 
-                       if (prev_pkt + 1 != payloadseqnum) {
-                               ksft_test_result_fail
-                                   ("ERROR: [%s] prev_pkt [%d], payloadseqnum [%d]\n",
-                                    __func__, prev_pkt, payloadseqnum);
-                               ksft_exit_xfail();
-                       }
+       optlen = sizeof(stats);
+       err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen);
+       if (err) {
+               ksft_test_result_fail("ERROR: [%s] getsockopt(XDP_STATISTICS) error %u %s\n",
+                                     __func__, -err, strerror(-err));
+               return;
+       }
 
-                       prev_pkt = payloadseqnum;
-                       pkt_counter++;
-               } else {
-                       ksft_print_msg("Invalid frame received: ");
-                       ksft_print_msg("[IP_PKT_VER: %02X], [IP_PKT_TOS: %02X]\n", iphdr->version,
-                                      iphdr->tos);
-               }
+       if (stats.tx_invalid_descs == ifobject->pkt_stream->nb_pkts)
+               return;
 
-               TAILQ_REMOVE(&head, pkt_node_rx_q, pkt_nodes);
-               free(pkt_node_rx_q->pkt_frame);
-               free(pkt_node_rx_q);
-               pkt_node_rx_q = NULL;
-       }
+       ksft_test_result_fail("ERROR: [%s] tx_invalid_descs incorrect. Got [%u] expected [%u]\n",
+                             __func__, stats.tx_invalid_descs, ifobject->pkt_stream->nb_pkts);
 }
 
 static void thread_common_ops(struct ifobject *ifobject, void *bufs)
 {
-       int umem_sz = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE;
+       u64 umem_sz = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE;
+       int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
+       size_t mmap_sz = umem_sz;
        int ctr = 0;
        int ret;
 
        ifobject->ns_fd = switch_namespace(ifobject->nsname);
 
        if (test_type == TEST_TYPE_BPF_RES)
-               umem_sz *= 2;
+               mmap_sz *= 2;
 
-       bufs = mmap(NULL, umem_sz,
-                   PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       bufs = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
        if (bufs == MAP_FAILED)
                exit_with_error(errno);
 
-       xsk_configure_umem(ifobject, bufs, 0);
-       ifobject->umem = ifobject->umem_arr[0];
-       ret = xsk_configure_socket(ifobject, 0);
-
-       /* Retry Create Socket if it fails as xsk_socket__create()
-        * is asynchronous
-        */
-       while (ret && ctr < SOCK_RECONF_CTR) {
-               xsk_configure_umem(ifobject, bufs, 0);
+       while (ctr++ < SOCK_RECONF_CTR) {
+               xsk_configure_umem(ifobject, bufs, umem_sz, 0);
                ifobject->umem = ifobject->umem_arr[0];
                ret = xsk_configure_socket(ifobject, 0);
+               if (!ret)
+                       break;
+
+               /* Retry creating the socket if it fails, as xsk_socket__create() is asynchronous */
                usleep(USLEEP_MAX);
-               ctr++;
+               if (ctr >= SOCK_RECONF_CTR)
+                       exit_with_error(-ret);
        }
 
-       if (ctr >= SOCK_RECONF_CTR)
-               exit_with_error(ret);
-
        ifobject->umem = ifobject->umem_arr[0];
        ifobject->xsk = ifobject->xsk_arr[0];
 
        if (test_type == TEST_TYPE_BPF_RES) {
-               xsk_configure_umem(ifobject, (u8 *)bufs + (umem_sz / 2), 1);
+               xsk_configure_umem(ifobject, (u8 *)bufs + umem_sz, umem_sz, 1);
                ifobject->umem = ifobject->umem_arr[1];
                ret = xsk_configure_socket(ifobject, 1);
        }
@@ -809,33 +811,18 @@ static void testapp_cleanup_xsk_res(struct ifobject *ifobj)
 
 static void *worker_testapp_validate_tx(void *arg)
 {
-       struct udphdr *udp_hdr =
-           (struct udphdr *)(pkt_data + sizeof(struct ethhdr) + sizeof(struct iphdr));
-       struct iphdr *ip_hdr = (struct iphdr *)(pkt_data + sizeof(struct ethhdr));
-       struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data;
        struct ifobject *ifobject = (struct ifobject *)arg;
-       struct generic_data data;
        void *bufs = NULL;
 
        if (!second_step)
                thread_common_ops(ifobject, bufs);
 
-       for (int i = 0; i < num_frames; i++) {
-               /*send EOT frame */
-               if (i == (num_frames - 1))
-                       data.seqnum = -1;
-               else
-                       data.seqnum = i;
-               gen_udp_hdr(&data, ifobject, udp_hdr);
-               gen_ip_hdr(ifobject, ip_hdr);
-               gen_udp_csum(udp_hdr, ip_hdr);
-               gen_eth_hdr(ifobject, eth_hdr);
-               gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE);
-       }
+       print_verbose("Sending %d packets on interface %s\n", ifobject->pkt_stream->nb_pkts,
+                     ifobject->ifname);
+       send_pkts(ifobject);
 
-       print_verbose("Sending %d packets on interface %s\n",
-                     (opt_pkt_count - 1), ifobject->ifname);
-       tx_only_all(ifobject);
+       if (stat_test_type == STAT_TEST_TX_INVALID)
+               tx_stats_validate(ifobject);
 
        testapp_cleanup_xsk_res(ifobject);
        pthread_exit(NULL);
@@ -853,31 +840,16 @@ static void *worker_testapp_validate_rx(void *arg)
        if (stat_test_type != STAT_TEST_RX_FILL_EMPTY)
                xsk_populate_fill_ring(ifobject->umem);
 
-       TAILQ_INIT(&head);
-       if (debug_pkt_dump) {
-               pkt_buf = calloc(num_frames, sizeof(*pkt_buf));
-               if (!pkt_buf)
-                       exit_with_error(errno);
-       }
-
        fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk);
        fds[0].events = POLLIN;
 
        pthread_barrier_wait(&barr);
 
-       while (1) {
-               if (test_type != TEST_TYPE_STATS) {
-                       rx_pkt(ifobject->xsk, fds);
-                       worker_pkt_validate();
-               } else {
-                       worker_stats_validate(ifobject);
-               }
-               if (sigvar)
-                       break;
-       }
-
-       print_verbose("Received %d packets on interface %s\n",
-                     pkt_counter, ifobject->ifname);
+       if (test_type == TEST_TYPE_STATS)
+               while (!rx_stats_are_valid(ifobject))
+                       continue;
+       else
+               receive_pkts(ifobject->pkt_stream, ifobject->xsk, fds);
 
        if (test_type == TEST_TYPE_TEARDOWN)
                print_verbose("Destroying socket\n");
@@ -890,10 +862,18 @@ static void testapp_validate(void)
 {
        bool bidi = test_type == TEST_TYPE_BIDI;
        bool bpf = test_type == TEST_TYPE_BPF_RES;
+       struct pkt_stream *pkt_stream;
 
        if (pthread_barrier_init(&barr, NULL, 2))
                exit_with_error(errno);
 
+       if (stat_test_type == STAT_TEST_TX_INVALID)
+               pkt_stream = pkt_stream_generate(DEFAULT_PKT_CNT, XSK_UMEM__INVALID_FRAME_SIZE);
+       else
+               pkt_stream = pkt_stream_generate(DEFAULT_PKT_CNT, PKT_SIZE);
+       ifdict_tx->pkt_stream = pkt_stream;
+       ifdict_rx->pkt_stream = pkt_stream;
+
        /*Spawn RX thread */
        pthread_create(&t0, NULL, ifdict_rx->func_ptr, ifdict_rx);
 
@@ -907,15 +887,6 @@ static void testapp_validate(void)
        pthread_join(t1, NULL);
        pthread_join(t0, NULL);
 
-       if (debug_pkt_dump && test_type != TEST_TYPE_STATS) {
-               worker_pkt_dump();
-               for (int iter = 0; iter < num_frames - 1; iter++) {
-                       free(pkt_buf[iter]->payload);
-                       free(pkt_buf[iter]);
-               }
-               free(pkt_buf);
-       }
-
        if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !bpf && !(test_type == TEST_TYPE_STATS))
                print_ksft_result();
 }
@@ -925,9 +896,6 @@ static void testapp_teardown(void)
        int i;
 
        for (i = 0; i < MAX_TEARDOWN_ITER; i++) {
-               pkt_counter = 0;
-               prev_pkt = -1;
-               sigvar = 0;
                print_verbose("Creating socket\n");
                testapp_validate();
        }
@@ -953,9 +921,6 @@ static void swap_vectors(struct ifobject *ifobj1, struct ifobject *ifobj2)
 static void testapp_bidi(void)
 {
        for (int i = 0; i < MAX_BIDI_ITER; i++) {
-               pkt_counter = 0;
-               prev_pkt = -1;
-               sigvar = 0;
                print_verbose("Creating socket\n");
                testapp_validate();
                if (!second_step) {
@@ -987,9 +952,6 @@ static void testapp_bpf_res(void)
        int i;
 
        for (i = 0; i < MAX_BPF_ITER; i++) {
-               pkt_counter = 0;
-               prev_pkt = -1;
-               sigvar = 0;
                print_verbose("Creating socket\n");
                testapp_validate();
                if (!second_step)
@@ -1017,6 +979,8 @@ static void testapp_stats(void)
                case STAT_TEST_RX_FULL:
                        rxqsize = RX_FULL_RXQSIZE;
                        break;
+               case STAT_TEST_TX_INVALID:
+                       continue;
                default:
                        break;
                }
@@ -1062,10 +1026,7 @@ static void run_pkt_test(int mode, int type)
 
        /* reset defaults after potential previous test */
        xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
-       pkt_counter = 0;
        second_step = 0;
-       prev_pkt = -1;
-       sigvar = 0;
        stat_test_type = -1;
        rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS;
        frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
@@ -1102,62 +1063,70 @@ static void run_pkt_test(int mode, int type)
        }
 }
 
+static struct ifobject *ifobject_create(void)
+{
+       struct ifobject *ifobj;
+
+       ifobj = calloc(1, sizeof(struct ifobject));
+       if (!ifobj)
+               return NULL;
+
+       ifobj->xsk_arr = calloc(2, sizeof(struct xsk_socket_info *));
+       if (!ifobj->xsk_arr)
+               goto out_xsk_arr;
+
+       ifobj->umem_arr = calloc(2, sizeof(struct xsk_umem_info *));
+       if (!ifobj->umem_arr)
+               goto out_umem_arr;
+
+       return ifobj;
+
+out_umem_arr:
+       free(ifobj->xsk_arr);
+out_xsk_arr:
+       free(ifobj);
+       return NULL;
+}
+
+static void ifobject_delete(struct ifobject *ifobj)
+{
+       free(ifobj->umem_arr);
+       free(ifobj->xsk_arr);
+       free(ifobj);
+}
+
 int main(int argc, char **argv)
 {
        struct rlimit _rlim = { RLIM_INFINITY, RLIM_INFINITY };
-       bool failure = false;
        int i, j;
 
        if (setrlimit(RLIMIT_MEMLOCK, &_rlim))
                exit_with_error(errno);
 
-       for (int i = 0; i < MAX_INTERFACES; i++) {
-               ifdict[i] = malloc(sizeof(struct ifobject));
+       for (i = 0; i < MAX_INTERFACES; i++) {
+               ifdict[i] = ifobject_create();
                if (!ifdict[i])
-                       exit_with_error(errno);
-
-               ifdict[i]->ifdict_index = i;
-               ifdict[i]->xsk_arr = calloc(2, sizeof(struct xsk_socket_info *));
-               if (!ifdict[i]->xsk_arr) {
-                       failure = true;
-                       goto cleanup;
-               }
-               ifdict[i]->umem_arr = calloc(2, sizeof(struct xsk_umem_info *));
-               if (!ifdict[i]->umem_arr) {
-                       failure = true;
-                       goto cleanup;
-               }
+                       exit_with_error(ENOMEM);
        }
 
        setlocale(LC_ALL, "");
 
        parse_command_line(argc, argv);
 
-       num_frames = ++opt_pkt_count;
-
-       init_iface(ifdict[0], MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2, tx);
-       init_iface(ifdict[1], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1, rx);
+       init_iface(ifdict[tx], MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2, tx);
+       init_iface(ifdict[rx], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1, rx);
 
        ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX);
 
-       for (i = 0; i < TEST_MODE_MAX; i++) {
-               for (j = 0; j < TEST_TYPE_MAX; j++)
+       for (i = 0; i < TEST_MODE_MAX; i++)
+               for (j = 0; j < TEST_TYPE_MAX; j++) {
                        run_pkt_test(i, j);
-       }
-
-cleanup:
-       for (int i = 0; i < MAX_INTERFACES; i++) {
-               if (ifdict[i]->ns_fd != -1)
-                       close(ifdict[i]->ns_fd);
-               free(ifdict[i]->xsk_arr);
-               free(ifdict[i]->umem_arr);
-               free(ifdict[i]);
-       }
+                       usleep(USLEEP_MAX);
+               }
 
-       if (failure)
-               exit_with_error(errno);
+       for (i = 0; i < MAX_INTERFACES; i++)
+               ifobject_delete(ifdict[i]);
 
        ksft_exit_pass();
-
        return 0;
 }
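
The RX thread above now delegates stats checking to rx_stats_are_valid(), defined elsewhere
in the patch. As a minimal sketch of the mechanism such a check relies on, per-socket ring
counters can be read through the XDP_STATISTICS socket option from <linux/if_xdp.h> together
with libbpf's xsk_socket__fd(); the helper name and the expected-count argument below are
illustrative, not the function as merged:

	#include <stdbool.h>
	#include <sys/socket.h>
	#include <linux/if_xdp.h>
	#include <bpf/xsk.h>

	/* Sketch: fetch the kernel's per-socket XDP counters and compare
	 * one of them against the number of packets the test pushed. */
	static bool rx_dropped_count_matches(struct xsk_socket *xsk, __u64 expected)
	{
		struct xdp_statistics stats;
		socklen_t optlen = sizeof(stats);

		if (getsockopt(xsk_socket__fd(xsk), SOL_XDP, XDP_STATISTICS,
			       &stats, &optlen))
			return false;

		return stats.rx_dropped == expected;
	}

The same struct also carries tx_invalid_descs, which is presumably what the new
STAT_TEST_TX_INVALID case (packets sized XSK_UMEM__INVALID_FRAME_SIZE) is meant to bump.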
tools/testing/selftests/bpf/xdpxceiver.h
index 6c428b2..7e49b9f 100644 (file)
 #define IP_PKT_TOS 0x9
 #define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr))
 #define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr))
-#define EOT (-1)
-#define USLEEP_MAX 200000
+#define USLEEP_MAX 10000
 #define SOCK_RECONF_CTR 10
-#define BATCH_SIZE 64
+#define BATCH_SIZE 8
 #define POLL_TMOUT 1000
-#define DEFAULT_PKT_CNT 10000
+#define DEFAULT_PKT_CNT (4 * 1024)
 #define RX_FULL_RXQSIZE 32
+#define XSK_UMEM__INVALID_FRAME_SIZE (XSK_UMEM__DEFAULT_FRAME_SIZE + 1)
 
 #define print_verbose(x...) do { if (opt_verbose) ksft_print_msg(x); } while (0)
 
-typedef __u32 u32;
-typedef __u16 u16;
-typedef __u8 u8;
-
-enum TEST_MODES {
-       TEST_MODE_UNCONFIGURED = -1,
+enum test_mode {
        TEST_MODE_SKB,
        TEST_MODE_DRV,
        TEST_MODE_MAX
 };
 
-enum TEST_TYPES {
+enum test_type {
        TEST_TYPE_NOPOLL,
        TEST_TYPE_POLL,
        TEST_TYPE_TEARDOWN,
@@ -65,7 +60,7 @@ enum TEST_TYPES {
        TEST_TYPE_MAX
 };
 
-enum STAT_TEST_TYPES {
+enum stat_test_type {
        STAT_TEST_RX_DROPPED,
        STAT_TEST_TX_INVALID,
        STAT_TEST_RX_FULL,
@@ -73,21 +68,16 @@ enum STAT_TEST_TYPES {
        STAT_TEST_TYPE_MAX
 };
 
-static int configured_mode = TEST_MODE_UNCONFIGURED;
-static u8 debug_pkt_dump;
-static u32 num_frames;
+static int configured_mode;
+static bool opt_pkt_dump;
+static u32 num_frames = DEFAULT_PKT_CNT / 4;
 static bool second_step;
 static int test_type;
 
-static int opt_pkt_count;
-static u8 opt_verbose;
+static bool opt_verbose;
 
 static u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
 static u32 xdp_bind_flags = XDP_USE_NEED_WAKEUP | XDP_COPY;
-static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE];
-static u32 pkt_counter;
-static long prev_pkt = -1;
-static int sigvar;
 static int stat_test_type;
 static u32 rxqsize;
 static u32 frame_headroom;
@@ -104,10 +94,6 @@ struct xsk_socket_info {
        struct xsk_ring_prod tx;
        struct xsk_umem_info *umem;
        struct xsk_socket *xsk;
-       unsigned long rx_npkts;
-       unsigned long tx_npkts;
-       unsigned long prev_rx_npkts;
-       unsigned long prev_tx_npkts;
        u32 outstanding_tx;
 };
 
@@ -118,8 +104,15 @@ struct flow_vector {
        } vector;
 };
 
-struct generic_data {
-       u32 seqnum;
+struct pkt {
+       u64 addr;
+       u32 len;
+       u32 payload;
+};
+
+struct pkt_stream {
+       u32 nb_pkts;
+       struct pkt *pkts;
 };
 
 struct ifobject {
@@ -131,8 +124,8 @@ struct ifobject {
        struct xsk_umem_info *umem;
        void *(*func_ptr)(void *arg);
        struct flow_vector fv;
+       struct pkt_stream *pkt_stream;
        int ns_fd;
-       int ifdict_index;
        u32 dst_ip;
        u32 src_ip;
        u16 src_port;
@@ -149,18 +142,4 @@ static struct ifobject *ifdict_tx;
 pthread_barrier_t barr;
 pthread_t t0, t1;
 
-TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head);
-struct head_s *head_p;
-struct pkt {
-       char *pkt_frame;
-
-       TAILQ_ENTRY(pkt) pkt_nodes;
-} *pkt_node_rx, *pkt_node_rx_q;
-
-struct pkt_frame {
-       char *payload;
-} *pkt_obj;
-
-struct pkt_frame **pkt_buf;
-
 #endif                         /* XDPXCEIVER_H */
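
Given the struct pkt / struct pkt_stream layout above, the pkt_stream_generate() calls in
testapp_validate() plausibly lay out one fixed-length packet per umem frame, with the packet
index doubling as the payload seed. A sketch under those assumptions, using num_frames from
this header and exit_with_error() as used in xdpxceiver.c; the loop body is a guess at the
merged implementation, not a quotation of it:

	static struct pkt_stream *pkt_stream_generate(u32 nb_pkts, u32 pkt_len)
	{
		struct pkt_stream *pkt_stream;
		u32 i;

		pkt_stream = calloc(1, sizeof(*pkt_stream));
		if (!pkt_stream)
			exit_with_error(ENOMEM);

		pkt_stream->pkts = calloc(nb_pkts, sizeof(*pkt_stream->pkts));
		if (!pkt_stream->pkts)
			exit_with_error(ENOMEM);

		pkt_stream->nb_pkts = nb_pkts;
		for (i = 0; i < nb_pkts; i++) {
			/* One packet per frame; wrap when the stream is
			 * longer than the umem. */
			pkt_stream->pkts[i].addr = (i % num_frames) * XSK_UMEM__DEFAULT_FRAME_SIZE;
			pkt_stream->pkts[i].len = pkt_len;
			pkt_stream->pkts[i].payload = i;
		}
		return pkt_stream;
	}

Generating packets from such a specification is what lets both threads share a single
pkt_stream: TX fills descriptors from it, RX validates received frames against it.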
tools/testing/selftests/bpf/test_xsk.sh
index dac1c5f..bf29d25 100755 (executable)
@@ -8,14 +8,8 @@ ksft_xfail=2
 ksft_xpass=3
 ksft_skip=4
 
-GREEN='\033[0;92m'
-YELLOW='\033[0;93m'
-RED='\033[0;31m'
-NC='\033[0m'
-STACK_LIM=131072
 SPECFILE=veth.spec
 XSKOBJ=xdpxceiver
-NUMPKTS=10000
 
 validate_root_exec()
 {
@@ -50,22 +44,12 @@ validate_veth_spec_file()
 test_status()
 {
        statusval=$1
-       if [ -n "${colorconsole+set}" ]; then
-               if [ $statusval -eq 2 ]; then
-                       echo -e "${YELLOW}$2${NC}: [ ${RED}FAIL${NC} ]"
-               elif [ $statusval -eq 1 ]; then
-                       echo -e "${YELLOW}$2${NC}: [ ${RED}SKIPPED${NC} ]"
-               elif [ $statusval -eq 0 ]; then
-                       echo -e "${YELLOW}$2${NC}: [ ${GREEN}PASS${NC} ]"
-               fi
-       else
-               if [ $statusval -eq 2 ]; then
-                       echo -e "$2: [ FAIL ]"
-               elif [ $statusval -eq 1 ]; then
-                       echo -e "$2: [ SKIPPED ]"
-               elif [ $statusval -eq 0 ]; then
-                       echo -e "$2: [ PASS ]"
-               fi
+       if [ $statusval -eq 2 ]; then
+               echo -e "$2: [ FAIL ]"
+       elif [ $statusval -eq 1 ]; then
+               echo -e "$2: [ SKIPPED ]"
+       elif [ $statusval -eq 0 ]; then
+               echo -e "$2: [ PASS ]"
        fi
 }
 
@@ -107,5 +91,5 @@ validate_ip_utility()
 
 execxdpxceiver()
 {
-       ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} -C ${NUMPKTS} ${VERBOSE_ARG} ${DUMP_PKTS_ARG}
+       ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${VERBOSE_ARG} ${DUMP_PKTS_ARG}
 }
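
With the color branch gone, test_status maps a numeric result straight to a ksft-style
verdict. For instance (test name hypothetical), the following would print
"UMEM_SETUP: [ PASS ]":

	retval=0
	test_status $retval "UMEM_SETUP"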