bpf: Add BPF_FIB_LOOKUP_SKIP_NEIGH for bpf_fib_lookup
author Martin KaFai Lau <martin.lau@kernel.org>
Fri, 17 Feb 2023 20:55:14 +0000 (12:55 -0800)
committer Daniel Borkmann <daniel@iogearbox.net>
Fri, 17 Feb 2023 21:12:04 +0000 (22:12 +0100)
The bpf_fib_lookup() also looks up the neigh table.
This was done before bpf_redirect_neigh() was added.

In the use case that does not manage the neigh table
and requires bpf_fib_lookup() to look up a fib to
decide if it needs to redirect or not, the bpf prog can
depend only on using bpf_redirect_neigh() to look up the
neigh. It also keeps the neigh entries fresh and connected.

This patch adds a bpf_fib_lookup flag, SKIP_NEIGH, to avoid
the double neigh lookup when the bpf prog always calls
bpf_redirect_neigh() to do the neigh lookup. The params->smac
output is also skipped when SKIP_NEIGH is set because
bpf_redirect_neigh() will figure out the smac as well.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230217205515.3583372-1-martin.lau@linux.dev
include/uapi/linux/bpf.h
net/core/filter.c
tools/include/uapi/linux/bpf.h

index 1503f61336b613e81cdf315518c6c20a10aad557..62ce1f5d1b1d9ae8e49b8d17e1c7cfaae3068a2a 100644 (file)
@@ -3134,6 +3134,11 @@ union bpf_attr {
  *             **BPF_FIB_LOOKUP_OUTPUT**
  *                     Perform lookup from an egress perspective (default is
  *                     ingress).
+ *             **BPF_FIB_LOOKUP_SKIP_NEIGH**
+ *                     Skip the neighbour table lookup. *params*->dmac
+ *                     and *params*->smac will not be set as output. A common
+ *                     use case is to call **bpf_redirect_neigh**\ () after
+ *                     doing **bpf_fib_lookup**\ ().
  *
  *             *ctx* is either **struct xdp_md** for XDP programs or
  *             **struct sk_buff** tc cls_act programs.
@@ -6750,6 +6755,7 @@ struct bpf_raw_tracepoint_args {
 enum {
        BPF_FIB_LOOKUP_DIRECT  = (1U << 0),
        BPF_FIB_LOOKUP_OUTPUT  = (1U << 1),
+       BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2),
 };
 
 enum {
index 8daaaf76ab15047f8cf13ff073d41a2737707fe2..1d6f165923bffd968d067aadbfe59702f9b0a1be 100644 (file)
@@ -5722,12 +5722,8 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
 #endif
 
 #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
-static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
-                                 const struct neighbour *neigh,
-                                 const struct net_device *dev, u32 mtu)
+static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu)
 {
-       memcpy(params->dmac, neigh->ha, ETH_ALEN);
-       memcpy(params->smac, dev->dev_addr, ETH_ALEN);
        params->h_vlan_TCI = 0;
        params->h_vlan_proto = 0;
        if (mtu)
@@ -5838,21 +5834,29 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
        if (likely(nhc->nhc_gw_family != AF_INET6)) {
                if (nhc->nhc_gw_family)
                        params->ipv4_dst = nhc->nhc_gw.ipv4;
-
-               neigh = __ipv4_neigh_lookup_noref(dev,
-                                                (__force u32)params->ipv4_dst);
        } else {
                struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst;
 
                params->family = AF_INET6;
                *dst = nhc->nhc_gw.ipv6;
-               neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
        }
 
+       if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
+               goto set_fwd_params;
+
+       if (likely(nhc->nhc_gw_family != AF_INET6))
+               neigh = __ipv4_neigh_lookup_noref(dev,
+                                                 (__force u32)params->ipv4_dst);
+       else
+               neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst);
+
        if (!neigh || !(neigh->nud_state & NUD_VALID))
                return BPF_FIB_LKUP_RET_NO_NEIGH;
+       memcpy(params->dmac, neigh->ha, ETH_ALEN);
+       memcpy(params->smac, dev->dev_addr, ETH_ALEN);
 
-       return bpf_fib_set_fwd_params(params, neigh, dev, mtu);
+set_fwd_params:
+       return bpf_fib_set_fwd_params(params, mtu);
 }
 #endif
 
@@ -5960,24 +5964,33 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
        params->rt_metric = res.f6i->fib6_metric;
        params->ifindex = dev->ifindex;
 
+       if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
+               goto set_fwd_params;
+
        /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
         * not needed here.
         */
        neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
        if (!neigh || !(neigh->nud_state & NUD_VALID))
                return BPF_FIB_LKUP_RET_NO_NEIGH;
+       memcpy(params->dmac, neigh->ha, ETH_ALEN);
+       memcpy(params->smac, dev->dev_addr, ETH_ALEN);
 
-       return bpf_fib_set_fwd_params(params, neigh, dev, mtu);
+set_fwd_params:
+       return bpf_fib_set_fwd_params(params, mtu);
 }
 #endif
 
+#define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \
+                            BPF_FIB_LOOKUP_SKIP_NEIGH)
+
 BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
           struct bpf_fib_lookup *, params, int, plen, u32, flags)
 {
        if (plen < sizeof(*params))
                return -EINVAL;
 
-       if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
+       if (flags & ~BPF_FIB_LOOKUP_MASK)
                return -EINVAL;
 
        switch (params->family) {
@@ -6015,7 +6028,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
        if (plen < sizeof(*params))
                return -EINVAL;
 
-       if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
+       if (flags & ~BPF_FIB_LOOKUP_MASK)
                return -EINVAL;
 
        if (params->tot_len)
index 1503f61336b613e81cdf315518c6c20a10aad557..62ce1f5d1b1d9ae8e49b8d17e1c7cfaae3068a2a 100644 (file)
@@ -3134,6 +3134,11 @@ union bpf_attr {
  *             **BPF_FIB_LOOKUP_OUTPUT**
  *                     Perform lookup from an egress perspective (default is
  *                     ingress).
+ *             **BPF_FIB_LOOKUP_SKIP_NEIGH**
+ *                     Skip the neighbour table lookup. *params*->dmac
+ *                     and *params*->smac will not be set as output. A common
+ *                     use case is to call **bpf_redirect_neigh**\ () after
+ *                     doing **bpf_fib_lookup**\ ().
  *
  *             *ctx* is either **struct xdp_md** for XDP programs or
  *             **struct sk_buff** tc cls_act programs.
@@ -6750,6 +6755,7 @@ struct bpf_raw_tracepoint_args {
 enum {
        BPF_FIB_LOOKUP_DIRECT  = (1U << 0),
        BPF_FIB_LOOKUP_OUTPUT  = (1U << 1),
+       BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2),
 };
 
 enum {