net/ipv6: Add support for path selection using hash of 5-tuple
authorDavid Ahern <dsahern@gmail.com>
Fri, 2 Mar 2018 16:32:18 +0000 (08:32 -0800)
committerDavid S. Miller <davem@davemloft.net>
Sun, 4 Mar 2018 18:04:23 +0000 (13:04 -0500)
Some operators prefer IPv6 path selection to use a standard 5-tuple
hash rather than just an L3 hash with the flow label. To that end
add support to IPv6 for multipath hash policy similar to bf4e0a3db97eb
("net: ipv4: add support for ECMP hash policy choice"). The default
is still L3 which covers source and destination addresses along with
flow label and IPv6 protocol.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Tested-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Documentation/networking/ip-sysctl.txt
include/net/ip6_route.h
include/net/netevent.h
include/net/netns/ipv6.h
net/ipv6/icmp.c
net/ipv6/route.c
net/ipv6/sysctl_net_ipv6.c

index a553d4e..783675a 100644 (file)
@@ -1363,6 +1363,13 @@ flowlabel_reflect - BOOLEAN
        FALSE: disabled
        Default: FALSE
 
+fib_multipath_hash_policy - INTEGER
+       Controls which hash policy to use for multipath routes.
+       Default: 0 (Layer 3)
+       Possible values:
+       0 - Layer 3 (source and destination addresses plus flow label and IPv6 protocol)
+       1 - Layer 4 (standard 5-tuple)
+
 anycast_src_echo_reply - BOOLEAN
        Controls the use of anycast addresses as source addresses for ICMPv6
        echo reply
index 9594f93..ce2abc0 100644 (file)
@@ -130,8 +130,8 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt,
 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
                            const struct in6_addr *saddr, int oif,
                            const struct sk_buff *skb, int flags);
-u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb,
-                      struct flow_keys *hkeys);
+u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
+                      const struct sk_buff *skb, struct flow_keys *hkeys);
 
 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6);
 
index baee605..d991826 100644 (file)
@@ -27,6 +27,7 @@ enum netevent_notif_type {
        NETEVENT_REDIRECT,         /* arg is struct netevent_redirect ptr */
        NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */
        NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */
+       NETEVENT_IPV6_MPATH_HASH_UPDATE, /* arg is struct net ptr */
 };
 
 int register_netevent_notifier(struct notifier_block *nb);
index e286fda..5b51110 100644 (file)
@@ -28,6 +28,7 @@ struct netns_sysctl_ipv6 {
        int ip6_rt_gc_elasticity;
        int ip6_rt_mtu_expires;
        int ip6_rt_min_advmss;
+       int multipath_hash_policy;
        int flowlabel_consistency;
        int auto_flowlabels;
        int icmpv6_time;
index a5d9292..6f84668 100644 (file)
@@ -522,7 +522,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
        fl6.fl6_icmp_type = type;
        fl6.fl6_icmp_code = code;
        fl6.flowi6_uid = sock_net_uid(net, NULL);
-       fl6.mp_hash = rt6_multipath_hash(&fl6, skb, NULL);
+       fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL);
        security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
 
        sk = icmpv6_xmit_lock(net);
index d2b8368..f0ae584 100644 (file)
@@ -450,7 +450,8 @@ static bool rt6_check_expired(const struct rt6_info *rt)
        return false;
 }
 
-static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
+static struct rt6_info *rt6_multipath_select(const struct net *net,
+                                            struct rt6_info *match,
                                             struct flowi6 *fl6, int oif,
                                             const struct sk_buff *skb,
                                             int strict)
@@ -461,7 +462,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
         * case it will always be non-zero. Otherwise now is the time to do it.
         */
        if (!fl6->mp_hash)
-               fl6->mp_hash = rt6_multipath_hash(fl6, skb, NULL);
+               fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
 
        if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
                return match;
@@ -932,7 +933,7 @@ restart:
                rt = rt6_device_match(net, rt, &fl6->saddr,
                                      fl6->flowi6_oif, flags);
                if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
-                       rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif,
+                       rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
                                                  skb, flags);
        }
        if (rt == net->ipv6.ip6_null_entry) {
@@ -1674,7 +1675,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 redo_rt6_select:
        rt = rt6_select(net, fn, oif, strict);
        if (rt->rt6i_nsiblings)
-               rt = rt6_multipath_select(rt, fl6, oif, skb, strict);
+               rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
        if (rt == net->ipv6.ip6_null_entry) {
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
@@ -1839,21 +1840,56 @@ out:
 }
 
 /* if skb is set it will be used and fl6 can be NULL */
-u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb,
-                      struct flow_keys *flkeys)
+u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
+                      const struct sk_buff *skb, struct flow_keys *flkeys)
 {
        struct flow_keys hash_keys;
        u32 mhash;
 
-       memset(&hash_keys, 0, sizeof(hash_keys));
-       hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
-       if (skb) {
-               ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
-       } else {
-               hash_keys.addrs.v6addrs.src = fl6->saddr;
-               hash_keys.addrs.v6addrs.dst = fl6->daddr;
-               hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
-               hash_keys.basic.ip_proto = fl6->flowi6_proto;
+       switch (net->ipv6.sysctl.multipath_hash_policy) {
+       case 0:
+               memset(&hash_keys, 0, sizeof(hash_keys));
+               hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+               if (skb) {
+                       ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
+               } else {
+                       hash_keys.addrs.v6addrs.src = fl6->saddr;
+                       hash_keys.addrs.v6addrs.dst = fl6->daddr;
+                       hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
+                       hash_keys.basic.ip_proto = fl6->flowi6_proto;
+               }
+               break;
+       case 1:
+               if (skb) {
+                       unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
+                       struct flow_keys keys;
+
+                       /* short-circuit if we already have L4 hash present */
+                       if (skb->l4_hash)
+                               return skb_get_hash_raw(skb) >> 1;
+
+                       memset(&hash_keys, 0, sizeof(hash_keys));
+
+                        if (!flkeys) {
+                               skb_flow_dissect_flow_keys(skb, &keys, flag);
+                               flkeys = &keys;
+                       }
+                       hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+                       hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
+                       hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
+                       hash_keys.ports.src = flkeys->ports.src;
+                       hash_keys.ports.dst = flkeys->ports.dst;
+                       hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
+               } else {
+                       memset(&hash_keys, 0, sizeof(hash_keys));
+                       hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+                       hash_keys.addrs.v6addrs.src = fl6->saddr;
+                       hash_keys.addrs.v6addrs.dst = fl6->daddr;
+                       hash_keys.ports.src = fl6->fl6_sport;
+                       hash_keys.ports.dst = fl6->fl6_dport;
+                       hash_keys.basic.ip_proto = fl6->flowi6_proto;
+               }
+               break;
        }
        mhash = flow_hash_from_keys(&hash_keys);
 
@@ -1884,7 +1920,7 @@ void ip6_route_input(struct sk_buff *skb)
                flkeys = &_flkeys;
 
        if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
-               fl6.mp_hash = rt6_multipath_hash(&fl6, skb, flkeys);
+               fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
        skb_dst_drop(skb);
        skb_dst_set(skb,
                    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
index 262f791..966c42a 100644 (file)
 #include <net/ipv6.h>
 #include <net/addrconf.h>
 #include <net/inet_frag.h>
+#include <net/netevent.h>
 #ifdef CONFIG_NETLABEL
 #include <net/calipso.h>
 #endif
 
+static int zero;       /* .extra1 (minimum) for fib_multipath_hash_policy */
 static int one = 1;    /* .extra2 (maximum) for fib_multipath_hash_policy */
 static int auto_flowlabels_min;
 static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
 
+/*
+ * sysctl handler for fib_multipath_hash_policy.
+ *
+ * Delegates the read/write of the integer to proc_dointvec_minmax() (range
+ * taken from the table's extra1/extra2).  After a successful write, raises
+ * NETEVENT_IPV6_MPATH_HASH_UPDATE with the owning struct net as argument.
+ *
+ * Returns the result of proc_dointvec_minmax().
+ */
+static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write,
+                                         void __user *buffer, size_t *lenp,
+                                         loff_t *ppos)
+{
+       /* table->data points inside struct net; recover the namespace. */
+       struct net *net = container_of(table->data, struct net,
+                                      ipv6.sysctl.multipath_hash_policy);
+       int err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+       /* Reads and failed writes have nothing to announce. */
+       if (err || !write)
+               return err;
+
+       call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net);
+       return 0;
+}
 
 static struct ctl_table ipv6_table_template[] = {
        {
@@ -126,6 +143,15 @@ static struct ctl_table ipv6_table_template[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec
        },
+       {
+               .procname       = "fib_multipath_hash_policy",
+               .data           = &init_net.ipv6.sysctl.multipath_hash_policy,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_rt6_multipath_hash_policy,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
        { }
 };
 
@@ -190,6 +216,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
        ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt;
        ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len;
        ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len;
+       ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy,
 
        ipv6_route_table = ipv6_route_sysctl_init(net);
        if (!ipv6_route_table)