bpf: add handling of BPF_LWT_REROUTE to lwt_bpf.c

author Peter Oskolkov <posk@google.com>

Wed, 13 Feb 2019 19:53:39 +0000 (11:53 -0800)

committer Alexei Starovoitov <ast@kernel.org>

Thu, 14 Feb 2019 02:27:55 +0000 (18:27 -0800)
author Peter Oskolkov <posk@google.com>
Wed, 13 Feb 2019 19:53:39 +0000 (11:53 -0800)
committer Alexei Starovoitov <ast@kernel.org>
Thu, 14 Feb 2019 02:27:55 +0000 (18:27 -0800)
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c

index 079871f..32251f3 100644 (file)
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -17,6 +17,7 @@
  #include <linux/bpf.h>
  #include <net/lwtunnel.h>
  #include <net/gre.h>
+#include <net/ip6_route.h>
  
  struct bpf_lwt_prog {
         struct bpf_prog *prog;
@@ -56,6 +57,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
  
         switch (ret) {
         case BPF_OK:
+       case BPF_LWT_REROUTE:
                 break;
  
         case BPF_REDIRECT:
@@ -88,6 +90,30 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
         return ret;
  }
  
+/* Redo the input route lookup for a packet after the BPF program returned
+ * BPF_LWT_REROUTE, then pass the packet to the input handler of the newly
+ * selected dst. Only IPv4 and IPv6 packets are handled.
+ *
+ * The skb is not handed back to the caller: it is either passed on to
+ * dst_input() or freed here on error.
+ *
+ * Returns the result of dst_input() on success, -EAFNOSUPPORT for any other
+ * protocol, or the error reported by the route lookup.
+ */
+static int bpf_lwt_input_reroute(struct sk_buff *skb)
+{
+       int err;
+
+       if (skb->protocol == htons(ETH_P_IP)) {
+               struct net_device *dev = skb_dst(skb)->dev;
+               struct iphdr *iph = ip_hdr(skb);
+
+               /* Release the old dst before the lookup attaches a new one,
+                * otherwise the old reference is overwritten and leaked.
+                * The device is pinned across the lookup because the dst
+                * being dropped may be what was keeping it alive.
+                */
+               dev_hold(dev);
+               skb_dst_drop(skb);
+               err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+                                          iph->tos, dev);
+               dev_put(dev);
+       } else if (skb->protocol == htons(ETH_P_IPV6)) {
+               /* Same reasoning as for IPv4: do not let the lookup
+                * overwrite a dst that is still attached to the skb.
+                */
+               skb_dst_drop(skb);
+               err = ipv6_stub->ipv6_route_input(skb);
+       } else {
+               err = -EAFNOSUPPORT;
+       }
+
+       if (err)
+               goto err;
+       return dst_input(skb);
+
+err:
+       kfree_skb(skb);
+       return err;
+}
+
  static int bpf_input(struct sk_buff *skb)
  {
         struct dst_entry *dst = skb_dst(skb);
@@ -99,11 +125,11 @@ static int bpf_input(struct sk_buff *skb)
                 ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
                 if (ret < 0)
                         return ret;
+               if (ret == BPF_LWT_REROUTE)
+                       return bpf_lwt_input_reroute(skb);
         }
  
         if (unlikely(!dst->lwtstate->orig_input)) {
-               pr_warn_once("orig_input not set on dst for prog %s\n",
-                            bpf->out.name);
                 kfree_skb(skb);
                 return -EINVAL;
         }
@@ -148,6 +174,91 @@ static int xmit_check_hhlen(struct sk_buff *skb)
         return 0;
  }
  
+/* Redo the output route lookup for a packet after the BPF xmit program
+ * returned BPF_LWT_REROUTE (e.g. because it pushed a new IP header), attach
+ * the new dst to the skb and transmit it through dst_output().
+ *
+ * The lookup key is built from the packet's current IP/IPv6 header. The
+ * output interface is the socket's bound device if there is one, else the
+ * L3 master device of the current dst's device, else unspecified (0).
+ *
+ * The skb is not handed back to the caller: on success it has been passed
+ * to dst_output(); on any failure before that it is freed here, like the
+ * other error paths in this file that free the skb before returning a
+ * negative value.
+ *
+ * Returns LWTUNNEL_XMIT_DONE on success. On failure returns a negative
+ * errno: -EAFNOSUPPORT for a non-IP protocol, the route lookup error, the
+ * skb_cow_head() error, or the dst_output() status (positive NET_XMIT_*
+ * codes are converted with net_xmit_errno()).
+ */
+static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
+{
+       struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
+       int oif = l3mdev ? l3mdev->ifindex : 0;
+       struct dst_entry *dst = NULL;
+       int err = -EAFNOSUPPORT;
+       struct sock *sk;
+       struct net *net;
+       bool ipv4;
+
+       if (skb->protocol == htons(ETH_P_IP))
+               ipv4 = true;
+       else if (skb->protocol == htons(ETH_P_IPV6))
+               ipv4 = false;
+       else
+               goto drop;
+
+       sk = sk_to_full_sk(skb->sk);
+       if (sk) {
+               if (sk->sk_bound_dev_if)
+                       oif = sk->sk_bound_dev_if;
+               net = sock_net(sk);
+       } else {
+               net = dev_net(skb_dst(skb)->dev);
+       }
+
+       if (ipv4) {
+               struct iphdr *iph = ip_hdr(skb);
+               struct flowi4 fl4 = {};
+               struct rtable *rt;
+
+               fl4.flowi4_oif = oif;
+               fl4.flowi4_mark = skb->mark;
+               fl4.flowi4_uid = sock_net_uid(net, sk);
+               fl4.flowi4_tos = RT_TOS(iph->tos);
+               fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+               fl4.flowi4_proto = iph->protocol;
+               fl4.daddr = iph->daddr;
+               fl4.saddr = iph->saddr;
+
+               rt = ip_route_output_key(net, &fl4);
+               if (IS_ERR(rt)) {
+                       /* Surface the real routing error to the caller. */
+                       err = PTR_ERR(rt);
+                       goto drop;
+               }
+               dst = &rt->dst;
+       } else {
+               struct ipv6hdr *iph6 = ipv6_hdr(skb);
+               struct flowi6 fl6 = {};
+
+               fl6.flowi6_oif = oif;
+               fl6.flowi6_mark = skb->mark;
+               fl6.flowi6_uid = sock_net_uid(net, sk);
+               fl6.flowlabel = ip6_flowinfo(iph6);
+               fl6.flowi6_proto = iph6->nexthdr;
+               fl6.daddr = iph6->daddr;
+               fl6.saddr = iph6->saddr;
+
+               err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6);
+               if (unlikely(err))
+                       goto drop;
+               if (IS_ERR(dst)) {
+                       err = PTR_ERR(dst);
+                       goto drop;
+               }
+       }
+       if (unlikely(dst->error)) {
+               err = dst->error;
+               goto release_dst;
+       }
+
+       /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
+        * was done for the previous dst, so we are doing it here again, in
+        * case the new dst needs much more space. The call below is a noop
+        * if there is enough header space in skb.
+        */
+       err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+       if (unlikely(err))
+               goto release_dst;
+
+       /* From here on the skb owns the reference to the new dst. */
+       skb_dst_drop(skb);
+       skb_dst_set(skb, dst);
+
+       /* The skb has been handed to dst_output() and must not be freed
+        * here, whatever the outcome. A positive NET_XMIT_* status must not
+        * be returned as is, since the caller could mistake it for an
+        * LWTUNNEL_XMIT_* code.
+        */
+       err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
+       if (unlikely(err))
+               return err < 0 ? err : net_xmit_errno(err);
+
+       /* ip[6]_finish_output2 understands LWTUNNEL_XMIT_DONE */
+       return LWTUNNEL_XMIT_DONE;
+
+release_dst:
+       /* The new dst was never attached to the skb; drop our reference. */
+       dst_release(dst);
+drop:
+       kfree_skb(skb);
+       return err;
+}
+
  static int bpf_xmit(struct sk_buff *skb)
  {
         struct dst_entry *dst = skb_dst(skb);
@@ -155,11 +266,20 @@ static int bpf_xmit(struct sk_buff *skb)
  
         bpf = bpf_lwt_lwtunnel(dst->lwtstate);
         if (bpf->xmit.prog) {
+               __be16 proto = skb->protocol;
                 int ret;
  
                 ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
                 switch (ret) {
                 case BPF_OK:
+                       /* If the header changed, e.g. via bpf_lwt_push_encap,
+                        * BPF_LWT_REROUTE below should have been used if the
+                        * protocol was also changed.
+                        */
+                       if (skb->protocol != proto) {
+                               kfree_skb(skb);
+                               return -EINVAL;
+                       }
                         /* If the header was expanded, headroom might be too
                          * small for L2 header to come, expand as needed.
                          */
@@ -170,6 +290,8 @@ static int bpf_xmit(struct sk_buff *skb)
                         return LWTUNNEL_XMIT_CONTINUE;
                 case BPF_REDIRECT:
                         return LWTUNNEL_XMIT_DONE;
+               case BPF_LWT_REROUTE:
+                       return bpf_lwt_xmit_reroute(skb);
                 default:
                         return ret;
                 }
author	Peter Oskolkov <posk@google.com>
	Wed, 13 Feb 2019 19:53:39 +0000 (11:53 -0800)
committer	Alexei Starovoitov <ast@kernel.org>
	Thu, 14 Feb 2019 02:27:55 +0000 (18:27 -0800)