net/netfilter: Add unstable CT lookup helpers for XDP and TC-BPF
authorKumar Kartikeya Dwivedi <memxor@gmail.com>
Fri, 14 Jan 2022 16:39:49 +0000 (22:09 +0530)
committerAlexei Starovoitov <ast@kernel.org>
Tue, 18 Jan 2022 22:26:42 +0000 (14:26 -0800)
This change adds conntrack lookup helpers using the unstable kfunc call
interface for the XDP and TC-BPF hooks. The primary usecase is
implementing a synproxy in XDP, see Maxim's patchset [0].

Export get_net_ns_by_id as nf_conntrack_bpf.c needs to call it.

This object is only built when CONFIG_DEBUG_INFO_BTF_MODULES is enabled.

  [0]: https://lore.kernel.org/bpf/20211019144655.3483197-1-maximmi@nvidia.com

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220114163953.1455836-7-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
include/net/netfilter/nf_conntrack_bpf.h [new file with mode: 0644]
net/core/net_namespace.c
net/netfilter/Makefile
net/netfilter/nf_conntrack_bpf.c [new file with mode: 0644]
net/netfilter/nf_conntrack_core.c

diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h
new file mode 100644 (file)
index 0000000..a473b56
--- /dev/null
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _NF_CONNTRACK_BPF_H
+#define _NF_CONNTRACK_BPF_H
+
+#include <linux/btf.h>
+#include <linux/kconfig.h>
+
+#if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
+    (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+
+extern int register_nf_conntrack_bpf(void);
+
+#else
+
+static inline int register_nf_conntrack_bpf(void)
+{
+       return 0;
+}
+
+#endif
+
+#endif /* _NF_CONNTRACK_BPF_H */
index 9b7171c40434985b869c1477975fc75447d78c3b..3b471781327fd7b8e4e2122a324346f7613bbc39 100644 (file)
@@ -299,6 +299,7 @@ struct net *get_net_ns_by_id(const struct net *net, int id)
 
        return peer;
 }
+EXPORT_SYMBOL_GPL(get_net_ns_by_id);
 
 /*
  * setup_net runs the initializers for the network namespace object.
index a135b1a46014c24a0d611a6dadc7c20b8f7a8cd9..238b6a620e888911e9a0034cdb42909079eab056 100644 (file)
@@ -14,6 +14,11 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
+ifeq ($(CONFIG_NF_CONNTRACK),m)
+nf_conntrack-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_conntrack_bpf.o
+else ifeq ($(CONFIG_NF_CONNTRACK),y)
+nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o
+endif
 
 obj-$(CONFIG_NETFILTER) = netfilter.o
 
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
new file mode 100644 (file)
index 0000000..8ad3f52
--- /dev/null
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable Conntrack Helpers for XDP and TC-BPF hook
+ *
+ * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/types.h>
+#include <linux/btf_ids.h>
+#include <linux/net_namespace.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+
+/* bpf_ct_opts - Options for CT lookup helpers
+ *
+ * Members:
+ * @netns_id   - Specify the network namespace for lookup
+ *              Values:
+ *                BPF_F_CURRENT_NETNS (-1)
+ *                  Use namespace associated with ctx (xdp_md, __sk_buff)
+ *                [0, S32_MAX]
+ *                  Network Namespace ID
+ * @error      - Out parameter, set for any errors encountered
+ *              Values:
+ *                -EINVAL - Passed NULL for bpf_tuple pointer
+ *                -EINVAL - opts->reserved is not 0
+ *                -EINVAL - netns_id is less than -1
+ *                -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (12)
+ *                -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP
+ *                -ENONET - No network namespace found for netns_id
+ *                -ENOENT - Conntrack lookup could not find entry for tuple
+ *                -EAFNOSUPPORT - tuple__sz isn't one of sizeof(tuple->ipv4)
+ *                                or sizeof(tuple->ipv6)
+ * @l4proto    - Layer 4 protocol
+ *              Values:
+ *                IPPROTO_TCP, IPPROTO_UDP
+ * @reserved   - Reserved member, will be reused for more options in future
+ *              Values:
+ *                0
+ */
+struct bpf_ct_opts {
+       s32 netns_id;
+       s32 error;
+       u8 l4proto;
+       u8 reserved[3];
+};
+
+enum {
+       NF_BPF_CT_OPTS_SZ = 12,
+};
+
+static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
+                                         struct bpf_sock_tuple *bpf_tuple,
+                                         u32 tuple_len, u8 protonum,
+                                         s32 netns_id)
+{
+       struct nf_conntrack_tuple_hash *hash;
+       struct nf_conntrack_tuple tuple;
+
+       if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
+               return ERR_PTR(-EPROTO);
+       if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
+               return ERR_PTR(-EINVAL);
+
+       memset(&tuple, 0, sizeof(tuple));
+       switch (tuple_len) {
+       case sizeof(bpf_tuple->ipv4):
+               tuple.src.l3num = AF_INET;
+               tuple.src.u3.ip = bpf_tuple->ipv4.saddr;
+               tuple.src.u.tcp.port = bpf_tuple->ipv4.sport;
+               tuple.dst.u3.ip = bpf_tuple->ipv4.daddr;
+               tuple.dst.u.tcp.port = bpf_tuple->ipv4.dport;
+               break;
+       case sizeof(bpf_tuple->ipv6):
+               tuple.src.l3num = AF_INET6;
+               memcpy(tuple.src.u3.ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
+               tuple.src.u.tcp.port = bpf_tuple->ipv6.sport;
+               memcpy(tuple.dst.u3.ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
+               tuple.dst.u.tcp.port = bpf_tuple->ipv6.dport;
+               break;
+       default:
+               return ERR_PTR(-EAFNOSUPPORT);
+       }
+
+       tuple.dst.protonum = protonum;
+
+       if (netns_id >= 0) {
+               net = get_net_ns_by_id(net, netns_id);
+               if (unlikely(!net))
+                       return ERR_PTR(-ENONET);
+       }
+
+       hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple);
+       if (netns_id >= 0)
+               put_net(net);
+       if (!hash)
+               return ERR_PTR(-ENOENT);
+       return nf_ct_tuplehash_to_ctrack(hash);
+}
+
+__diag_push();
+__diag_ignore(GCC, 8, "-Wmissing-prototypes",
+             "Global functions as their definitions will be in nf_conntrack BTF");
+
+/* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a
+ *                    reference to it
+ *
+ * Parameters:
+ * @xdp_ctx    - Pointer to ctx (xdp_md) in XDP program
+ *                 Cannot be NULL
+ * @bpf_tuple  - Pointer to memory representing the tuple to look up
+ *                 Cannot be NULL
+ * @tuple__sz  - Length of the tuple structure
+ *                 Must be one of sizeof(bpf_tuple->ipv4) or
+ *                 sizeof(bpf_tuple->ipv6)
+ * @opts       - Additional options for lookup (documented above)
+ *                 Cannot be NULL
+ * @opts__sz   - Length of the bpf_ct_opts structure
+ *                 Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn *
+bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
+                 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+       struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
+       struct net *caller_net;
+       struct nf_conn *nfct;
+
+       BUILD_BUG_ON(sizeof(struct bpf_ct_opts) != NF_BPF_CT_OPTS_SZ);
+
+       if (!opts)
+               return NULL;
+       if (!bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
+           opts->reserved[2] || opts__sz != NF_BPF_CT_OPTS_SZ) {
+               opts->error = -EINVAL;
+               return NULL;
+       }
+       caller_net = dev_net(ctx->rxq->dev);
+       nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts->l4proto,
+                                 opts->netns_id);
+       if (IS_ERR(nfct)) {
+               opts->error = PTR_ERR(nfct);
+               return NULL;
+       }
+       return nfct;
+}
+
+/* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a
+ *                    reference to it
+ *
+ * Parameters:
+ * @skb_ctx    - Pointer to ctx (__sk_buff) in TC program
+ *                 Cannot be NULL
+ * @bpf_tuple  - Pointer to memory representing the tuple to look up
+ *                 Cannot be NULL
+ * @tuple__sz  - Length of the tuple structure
+ *                 Must be one of sizeof(bpf_tuple->ipv4) or
+ *                 sizeof(bpf_tuple->ipv6)
+ * @opts       - Additional options for lookup (documented above)
+ *                 Cannot be NULL
+ * @opts__sz   - Length of the bpf_ct_opts structure
+ *                 Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn *
+bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
+                 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+       struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+       struct net *caller_net;
+       struct nf_conn *nfct;
+
+       BUILD_BUG_ON(sizeof(struct bpf_ct_opts) != NF_BPF_CT_OPTS_SZ);
+
+       if (!opts)
+               return NULL;
+       if (!bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
+           opts->reserved[2] || opts__sz != NF_BPF_CT_OPTS_SZ) {
+               opts->error = -EINVAL;
+               return NULL;
+       }
+       caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+       nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts->l4proto,
+                                 opts->netns_id);
+       if (IS_ERR(nfct)) {
+               opts->error = PTR_ERR(nfct);
+               return NULL;
+       }
+       return nfct;
+}
+
+/* bpf_ct_release - Release acquired nf_conn object
+ *
+ * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects
+ * the program if any references remain in the program in all of the explored
+ * states.
+ *
+ * Parameters:
+ * @nf_conn     - Pointer to referenced nf_conn object, obtained using
+ *                bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
+ */
+void bpf_ct_release(struct nf_conn *nfct)
+{
+       if (!nfct)
+               return;
+       nf_ct_put(nfct);
+}
+
+__diag_pop()
+
+BTF_SET_START(nf_ct_xdp_check_kfunc_ids)
+BTF_ID(func, bpf_xdp_ct_lookup)
+BTF_ID(func, bpf_ct_release)
+BTF_SET_END(nf_ct_xdp_check_kfunc_ids)
+
+BTF_SET_START(nf_ct_tc_check_kfunc_ids)
+BTF_ID(func, bpf_skb_ct_lookup)
+BTF_ID(func, bpf_ct_release)
+BTF_SET_END(nf_ct_tc_check_kfunc_ids)
+
+BTF_SET_START(nf_ct_acquire_kfunc_ids)
+BTF_ID(func, bpf_xdp_ct_lookup)
+BTF_ID(func, bpf_skb_ct_lookup)
+BTF_SET_END(nf_ct_acquire_kfunc_ids)
+
+BTF_SET_START(nf_ct_release_kfunc_ids)
+BTF_ID(func, bpf_ct_release)
+BTF_SET_END(nf_ct_release_kfunc_ids)
+
+/* Both sets are identical */
+#define nf_ct_ret_null_kfunc_ids nf_ct_acquire_kfunc_ids
+
+static const struct btf_kfunc_id_set nf_conntrack_xdp_kfunc_set = {
+       .owner        = THIS_MODULE,
+       .check_set    = &nf_ct_xdp_check_kfunc_ids,
+       .acquire_set  = &nf_ct_acquire_kfunc_ids,
+       .release_set  = &nf_ct_release_kfunc_ids,
+       .ret_null_set = &nf_ct_ret_null_kfunc_ids,
+};
+
+static const struct btf_kfunc_id_set nf_conntrack_tc_kfunc_set = {
+       .owner        = THIS_MODULE,
+       .check_set    = &nf_ct_tc_check_kfunc_ids,
+       .acquire_set  = &nf_ct_acquire_kfunc_ids,
+       .release_set  = &nf_ct_release_kfunc_ids,
+       .ret_null_set = &nf_ct_ret_null_kfunc_ids,
+};
+
+int register_nf_conntrack_bpf(void)
+{
+       int ret;
+
+       ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_xdp_kfunc_set);
+       return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_tc_kfunc_set);
+}
index 894a325d39f2028ed899feec8cb9e4bd2199eca7..3b60fca619208b5e9f1b9555587ed0ce304c19ca 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/rculist_nulls.h>
 
 #include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <net/netfilter/nf_conntrack_helper.h>
@@ -2748,8 +2749,15 @@ int nf_conntrack_init_start(void)
        conntrack_gc_work_init(&conntrack_gc_work);
        queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
 
+       ret = register_nf_conntrack_bpf();
+       if (ret < 0)
+               goto err_kfunc;
+
        return 0;
 
+err_kfunc:
+       cancel_delayed_work_sync(&conntrack_gc_work.dwork);
+       nf_conntrack_proto_fini();
 err_proto:
        nf_conntrack_seqadj_fini();
 err_seqadj: