netfilter: flowtable: use dev_fill_forward_path() to obtain egress device
authorPablo Neira Ayuso <pablo@netfilter.org>
Wed, 24 Mar 2021 01:30:40 +0000 (02:30 +0100)
committerDavid S. Miller <davem@davemloft.net>
Wed, 24 Mar 2021 19:48:39 +0000 (12:48 -0700)
The egress device in the tuple is obtained from route. Use
dev_fill_forward_path() instead to provide the real egress device for
this flow whenever this is available.

The new FLOW_OFFLOAD_XMIT_DIRECT type uses dev_queue_xmit() to transmit
ethernet frames. Cache the source and destination hardware address to
use dev_queue_xmit() to transfer packets.

The FLOW_OFFLOAD_XMIT_DIRECT replaces FLOW_OFFLOAD_XMIT_NEIGH if
dev_fill_forward_path() finds a direct transmit path.

In case of topology updates, if peer is moved to different bridge port,
the connection will time out, reconnect will result in a new entry with
the correct path. Snooping fdb updates would allow for cleaning up stale
flowtable entries.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/net/netfilter/nf_flow_table.h
net/netfilter/nf_flow_table_core.c
net/netfilter/nf_flow_table_ip.c
net/netfilter/nft_flow_offload.c

index dca9fc6..41c8436 100644 (file)
@@ -92,6 +92,7 @@ enum flow_offload_tuple_dir {
 enum flow_offload_xmit_type {
        FLOW_OFFLOAD_XMIT_NEIGH         = 0,
        FLOW_OFFLOAD_XMIT_XFRM,
+       FLOW_OFFLOAD_XMIT_DIRECT,
 };
 
 struct flow_offload_tuple {
@@ -120,8 +121,14 @@ struct flow_offload_tuple {
                                        xmit_type:2;
 
        u16                             mtu;
-
-       struct dst_entry                *dst_cache;
+       union {
+               struct dst_entry        *dst_cache;
+               struct {
+                       u32             ifidx;
+                       u8              h_source[ETH_ALEN];
+                       u8              h_dest[ETH_ALEN];
+               } out;
+       };
 };
 
 struct flow_offload_tuple_rhash {
@@ -168,6 +175,11 @@ struct nf_flow_route {
                struct {
                        u32                     ifindex;
                } in;
+               struct {
+                       u32                     ifindex;
+                       u8                      h_source[ETH_ALEN];
+                       u8                      h_dest[ETH_ALEN];
+               } out;
                enum flow_offload_xmit_type     xmit_type;
        } tuple[FLOW_OFFLOAD_DIR_MAX];
 };
index 51e3e1b..a92acb3 100644 (file)
@@ -81,9 +81,6 @@ static int flow_offload_fill_route(struct flow_offload *flow,
        struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
        struct dst_entry *dst = route->tuple[dir].dst;
 
-       if (!dst_hold_safe(route->tuple[dir].dst))
-               return -1;
-
        switch (flow_tuple->l3proto) {
        case NFPROTO_IPV4:
                flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
@@ -94,12 +91,36 @@ static int flow_offload_fill_route(struct flow_offload *flow,
        }
 
        flow_tuple->iifidx = route->tuple[dir].in.ifindex;
+
+       switch (route->tuple[dir].xmit_type) {
+       case FLOW_OFFLOAD_XMIT_DIRECT:
+               memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest,
+                      ETH_ALEN);
+               memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
+                      ETH_ALEN);
+               flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
+               break;
+       case FLOW_OFFLOAD_XMIT_XFRM:
+       case FLOW_OFFLOAD_XMIT_NEIGH:
+               if (!dst_hold_safe(route->tuple[dir].dst))
+                       return -1;
+
+               flow_tuple->dst_cache = dst;
+               break;
+       }
        flow_tuple->xmit_type = route->tuple[dir].xmit_type;
-       flow_tuple->dst_cache = dst;
 
        return 0;
 }
 
+static void nft_flow_dst_release(struct flow_offload *flow,
+                                enum flow_offload_tuple_dir dir)
+{
+       if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
+           flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)
+               dst_release(flow->tuplehash[dir].tuple.dst_cache);
+}
+
 int flow_offload_route_init(struct flow_offload *flow,
                            const struct nf_flow_route *route)
 {
@@ -118,7 +139,7 @@ int flow_offload_route_init(struct flow_offload *flow,
        return 0;
 
 err_route_reply:
-       dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
+       nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
 
        return err;
 }
@@ -169,8 +190,8 @@ static void flow_offload_fixup_ct(struct nf_conn *ct)
 
 static void flow_offload_route_release(struct flow_offload *flow)
 {
-       dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
-       dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
+       nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
+       nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY);
 }
 
 void flow_offload_free(struct flow_offload *flow)
index e9bef38..9e84767 100644 (file)
@@ -207,6 +207,24 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
        return NF_STOLEN;
 }
 
+static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
+                                      const struct flow_offload_tuple_rhash *tuplehash,
+                                      unsigned short type)
+{
+       struct net_device *outdev;
+
+       outdev = dev_get_by_index_rcu(net, tuplehash->tuple.out.ifidx);
+       if (!outdev)
+               return NF_DROP;
+
+       skb->dev = outdev;
+       dev_hard_header(skb, skb->dev, type, tuplehash->tuple.out.h_dest,
+                       tuplehash->tuple.out.h_source, skb->len);
+       dev_queue_xmit(skb);
+
+       return NF_STOLEN;
+}
+
 unsigned int
 nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
                        const struct nf_hook_state *state)
@@ -222,6 +240,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
        struct iphdr *iph;
        __be32 nexthop;
        u32 hdrsize;
+       int ret;
 
        if (skb->protocol != htons(ETH_P_IP))
                return NF_ACCEPT;
@@ -244,9 +263,13 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
        if (nf_flow_state_check(flow, iph->protocol, skb, thoff))
                return NF_ACCEPT;
 
-       if (!dst_check(&rt->dst, 0)) {
-               flow_offload_teardown(flow);
-               return NF_ACCEPT;
+       if (tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
+           tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) {
+               rt = (struct rtable *)tuplehash->tuple.dst_cache;
+               if (!dst_check(&rt->dst, 0)) {
+                       flow_offload_teardown(flow);
+                       return NF_ACCEPT;
+               }
        }
 
        if (skb_try_make_writable(skb, thoff + hdrsize))
@@ -263,8 +286,6 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
        if (flow_table->flags & NF_FLOWTABLE_COUNTER)
                nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
 
-       rt = (struct rtable *)tuplehash->tuple.dst_cache;
-
        if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
                memset(skb->cb, 0, sizeof(struct inet_skb_parm));
                IPCB(skb)->iif = skb->dev->ifindex;
@@ -272,13 +293,23 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
                return nf_flow_xmit_xfrm(skb, state, &rt->dst);
        }
 
-       outdev = rt->dst.dev;
-       skb->dev = outdev;
-       nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
-       skb_dst_set_noref(skb, &rt->dst);
-       neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+       switch (tuplehash->tuple.xmit_type) {
+       case FLOW_OFFLOAD_XMIT_NEIGH:
+               outdev = rt->dst.dev;
+               skb->dev = outdev;
+               nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
+               skb_dst_set_noref(skb, &rt->dst);
+               neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+               ret = NF_STOLEN;
+               break;
+       case FLOW_OFFLOAD_XMIT_DIRECT:
+               ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IP);
+               if (ret == NF_DROP)
+                       flow_offload_teardown(flow);
+               break;
+       }
 
-       return NF_STOLEN;
+       return ret;
 }
 EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
 
@@ -444,6 +475,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
        struct ipv6hdr *ip6h;
        struct rt6_info *rt;
        u32 hdrsize;
+       int ret;
 
        if (skb->protocol != htons(ETH_P_IPV6))
                return NF_ACCEPT;
@@ -465,9 +497,13 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
                                sizeof(*ip6h)))
                return NF_ACCEPT;
 
-       if (!dst_check(&rt->dst, 0)) {
-               flow_offload_teardown(flow);
-               return NF_ACCEPT;
+       if (tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
+           tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) {
+               rt = (struct rt6_info *)tuplehash->tuple.dst_cache;
+               if (!dst_check(&rt->dst, 0)) {
+                       flow_offload_teardown(flow);
+                       return NF_ACCEPT;
+               }
        }
 
        if (skb_try_make_writable(skb, sizeof(*ip6h) + hdrsize))
@@ -484,8 +520,6 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
        if (flow_table->flags & NF_FLOWTABLE_COUNTER)
                nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
 
-       rt = (struct rt6_info *)tuplehash->tuple.dst_cache;
-
        if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
                memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
                IP6CB(skb)->iif = skb->dev->ifindex;
@@ -493,12 +527,22 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
                return nf_flow_xmit_xfrm(skb, state, &rt->dst);
        }
 
-       outdev = rt->dst.dev;
-       skb->dev = outdev;
-       nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
-       skb_dst_set_noref(skb, &rt->dst);
-       neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
+       switch (tuplehash->tuple.xmit_type) {
+       case FLOW_OFFLOAD_XMIT_NEIGH:
+               outdev = rt->dst.dev;
+               skb->dev = outdev;
+               nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
+               skb_dst_set_noref(skb, &rt->dst);
+               neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
+               ret = NF_STOLEN;
+               break;
+       case FLOW_OFFLOAD_XMIT_DIRECT:
+               ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IPV6);
+               if (ret == NF_DROP)
+                       flow_offload_teardown(flow);
+               break;
+       }
 
-       return NF_STOLEN;
+       return ret;
 }
 EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
index 15f90c3..a6595dc 100644 (file)
@@ -39,12 +39,11 @@ static void nft_default_forward_path(struct nf_flow_route *route,
 static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
                                     const struct dst_entry *dst_cache,
                                     const struct nf_conn *ct,
-                                    enum ip_conntrack_dir dir,
+                                    enum ip_conntrack_dir dir, u8 *ha,
                                     struct net_device_path_stack *stack)
 {
        const void *daddr = &ct->tuplehash[!dir].tuple.src.u3;
        struct net_device *dev = dst_cache->dev;
-       unsigned char ha[ETH_ALEN];
        struct neighbour *n;
        u8 nud_state;
 
@@ -66,27 +65,43 @@ static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
 
 struct nft_forward_info {
        const struct net_device *indev;
+       const struct net_device *outdev;
+       u8 h_source[ETH_ALEN];
+       u8 h_dest[ETH_ALEN];
+       enum flow_offload_xmit_type xmit_type;
 };
 
 static void nft_dev_path_info(const struct net_device_path_stack *stack,
-                             struct nft_forward_info *info)
+                             struct nft_forward_info *info,
+                             unsigned char *ha)
 {
        const struct net_device_path *path;
        int i;
 
+       memcpy(info->h_dest, ha, ETH_ALEN);
+
        for (i = 0; i < stack->num_paths; i++) {
                path = &stack->path[i];
                switch (path->type) {
                case DEV_PATH_ETHERNET:
                        info->indev = path->dev;
+                       if (is_zero_ether_addr(info->h_source))
+                               memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
                        break;
-               case DEV_PATH_VLAN:
                case DEV_PATH_BRIDGE:
+                       if (is_zero_ether_addr(info->h_source))
+                               memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+                       info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+                       break;
+               case DEV_PATH_VLAN:
                default:
                        info->indev = NULL;
                        break;
                }
        }
+       if (!info->outdev)
+               info->outdev = info->indev;
 }
 
 static bool nft_flowtable_find_dev(const struct net_device *dev,
@@ -114,14 +129,22 @@ static void nft_dev_forward_path(struct nf_flow_route *route,
        const struct dst_entry *dst = route->tuple[dir].dst;
        struct net_device_path_stack stack;
        struct nft_forward_info info = {};
+       unsigned char ha[ETH_ALEN];
 
-       if (nft_dev_fill_forward_path(route, dst, ct, dir, &stack) >= 0)
-               nft_dev_path_info(&stack, &info);
+       if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
+               nft_dev_path_info(&stack, &info, ha);
 
        if (!info.indev || !nft_flowtable_find_dev(info.indev, ft))
                return;
 
        route->tuple[!dir].in.ifindex = info.indev->ifindex;
+
+       if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
+               memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
+               memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
+               route->tuple[dir].out.ifindex = info.outdev->ifindex;
+               route->tuple[dir].xmit_type = info.xmit_type;
+       }
 }
 
 static int nft_flow_route(const struct nft_pktinfo *pkt,