From 33bd5ac54dc47e002da4a395aaf9bf158dd17709 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 3 Jul 2018 14:36:21 -0700 Subject: [PATCH] net/ipv6: Revert attempt to simplify route replace and append NetworkManager likes to manage linklocal prefix routes and does so with the NLM_F_APPEND flag, breaking attempts to simplify the IPv6 route code and by extension enable multipath routes with device only nexthops. Revert f34436a43092 and these followup patches: 6eba08c3626b ("ipv6: Only emit append events for appended routes"). ce45bded6435 ("mlxsw: spectrum_router: Align with new route replace logic") 53b562df8c20 ("mlxsw: spectrum_router: Allow appending to dev-only routes") Update the fib_tests cases to reflect the old behavior. Fixes: f34436a43092 ("net/ipv6: Simplify route replace and appending into multipath route") Signed-off-by: David Ahern --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 48 +++---- include/net/ip6_route.h | 6 + net/ipv6/ip6_fib.c | 156 ++++++++++++--------- net/ipv6/route.c | 3 +- tools/testing/selftests/net/fib_tests.sh | 41 ------ 5 files changed, 117 insertions(+), 137 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 6aaaf3d..77b2adb 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -4756,6 +4756,12 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6) kfree(mlxsw_sp_rt6); } +static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt) +{ + /* RTF_CACHE routes are ignored */ + return (rt->fib6_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY; +} + static struct fib6_info * mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry) { @@ -4765,11 +4771,11 @@ mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry) static struct mlxsw_sp_fib6_entry * mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, - const struct fib6_info *nrt, bool append) + const struct fib6_info *nrt, bool replace) { struct mlxsw_sp_fib6_entry *fib6_entry; - if (!append) + if (!mlxsw_sp_fib6_rt_can_mp(nrt) || replace) return NULL; list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { @@ -4784,7 +4790,8 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, break; if (rt->fib6_metric < nrt->fib6_metric) continue; - if (rt->fib6_metric == nrt->fib6_metric) + if (rt->fib6_metric == nrt->fib6_metric && + mlxsw_sp_fib6_rt_can_mp(rt)) return fib6_entry; if (rt->fib6_metric > nrt->fib6_metric) break; @@ -5163,7 +5170,7 @@ static struct mlxsw_sp_fib6_entry * mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node, const struct fib6_info *nrt, bool replace) { - struct mlxsw_sp_fib6_entry *fib6_entry; + struct mlxsw_sp_fib6_entry *fib6_entry, *fallback = NULL; list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); @@ -5172,13 +5179,18 @@ mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node, continue; if (rt->fib6_table->tb6_id != nrt->fib6_table->tb6_id) break; - if (replace && rt->fib6_metric == nrt->fib6_metric) - return fib6_entry; + if (replace && rt->fib6_metric == nrt->fib6_metric) { + if (mlxsw_sp_fib6_rt_can_mp(rt) == + mlxsw_sp_fib6_rt_can_mp(nrt)) + return fib6_entry; + if (mlxsw_sp_fib6_rt_can_mp(nrt)) + fallback = fallback ?: fib6_entry; + } if (rt->fib6_metric > nrt->fib6_metric) - return fib6_entry; + return fallback ?: fib6_entry; } - return NULL; + return fallback; } static int @@ -5304,8 +5316,7 @@ static void mlxsw_sp_fib6_entry_replace(struct mlxsw_sp *mlxsw_sp, } static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, - struct fib6_info *rt, bool replace, - bool append) + struct fib6_info *rt, bool replace) { struct mlxsw_sp_fib6_entry *fib6_entry; struct mlxsw_sp_fib_node *fib_node; @@ -5331,7 +5342,7 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, /* Before creating a new entry, try to append route to an existing * multipath entry. */ - fib6_entry = mlxsw_sp_fib6_node_mp_entry_find(fib_node, rt, append); + fib6_entry = mlxsw_sp_fib6_node_mp_entry_find(fib_node, rt, replace); if (fib6_entry) { err = mlxsw_sp_fib6_entry_nexthop_add(mlxsw_sp, fib6_entry, rt); if (err) @@ -5339,14 +5350,6 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, return 0; } - /* We received an append event, yet did not find any route to - * append to. - */ - if (WARN_ON(append)) { - err = -EINVAL; - goto err_fib6_entry_append; - } - fib6_entry = mlxsw_sp_fib6_entry_create(mlxsw_sp, fib_node, rt); if (IS_ERR(fib6_entry)) { err = PTR_ERR(fib6_entry); @@ -5364,7 +5367,6 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, err_fib6_node_entry_link: mlxsw_sp_fib6_entry_destroy(mlxsw_sp, fib6_entry); err_fib6_entry_create: -err_fib6_entry_append: err_fib6_entry_nexthop_add: mlxsw_sp_fib_node_put(mlxsw_sp, fib_node); return err; @@ -5715,7 +5717,7 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work) struct mlxsw_sp_fib_event_work *fib_work = container_of(work, struct mlxsw_sp_fib_event_work, work); struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp; - bool replace, append; + bool replace; int err; rtnl_lock(); @@ -5726,10 +5728,8 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work) case FIB_EVENT_ENTRY_APPEND: /* fall through */ case FIB_EVENT_ENTRY_ADD: replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE; - append = fib_work->event == FIB_EVENT_ENTRY_APPEND; err = mlxsw_sp_router_fib6_add(mlxsw_sp, - fib_work->fen6_info.rt, replace, - append); + fib_work->fen6_info.rt, replace); if (err) mlxsw_sp_router_fib_abort(mlxsw_sp); mlxsw_sp_rt6_release(fib_work->fen6_info.rt); diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 59656fc5..7b9c82d 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,6 +66,12 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } +static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) +{ + return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == + RTF_GATEWAY; +} + void ip6_route_input(struct sk_buff *skb); struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 1fb2f31..d212738 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -935,20 +935,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); - enum fib_event_type event = FIB_EVENT_ENTRY_ADD; - struct fib6_info *iter = NULL, *match = NULL; + struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; + struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); - int append = (info->nlh && - (info->nlh->nlmsg_flags & NLM_F_APPEND)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; + bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); u16 nlflags = NLM_F_EXCL; int err; - if (append) + if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) nlflags |= NLM_F_APPEND; ins = &fn->leaf; @@ -970,8 +969,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, nlflags &= ~NLM_F_EXCL; if (replace) { - found++; - break; + if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { + found++; + break; + } + if (rt_can_ecmp) + fallback_ins = fallback_ins ?: ins; + goto next_iter; } if (rt6_duplicate_nexthop(iter, rt)) { @@ -986,51 +990,71 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } - - /* first route that matches */ - if (!match) - match = iter; + /* If we have the same destination and the same metric, + * but not the same gateway, then the route we try to + * add is sibling to this route, increment our counter + * of siblings, and later we will add our route to the + * list. + * Only static routes (which don't have flag + * RTF_EXPIRES) are used for ECMPv6. + * + * To avoid long list, we only had siblings if the + * route have a gateway. + */ + if (rt_can_ecmp && + rt6_qualify_for_ecmp(iter)) + rt->fib6_nsiblings++; } if (iter->fib6_metric > rt->fib6_metric) break; +next_iter: ins = &iter->fib6_next; } + if (fallback_ins && !found) { + /* No ECMP-able route found, replace first non-ECMP one */ + ins = fallback_ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + found++; + } + /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. */ - if (append && match) { + if (rt->fib6_nsiblings) { + unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; - if (rt->fib6_flags & RTF_REJECT) { - NL_SET_ERR_MSG(extack, - "Can not append a REJECT route"); - return -EINVAL; - } else if (match->fib6_flags & RTF_REJECT) { - NL_SET_ERR_MSG(extack, - "Can not append to a REJECT route"); - return -EINVAL; + /* Find the first route that have the same metric */ + sibling = leaf; + while (sibling) { + if (sibling->fib6_metric == rt->fib6_metric && + rt6_qualify_for_ecmp(sibling)) { + list_add_tail(&rt->fib6_siblings, + &sibling->fib6_siblings); + break; + } + sibling = rcu_dereference_protected(sibling->fib6_next, + lockdep_is_held(&rt->fib6_table->tb6_lock)); } - event = FIB_EVENT_ENTRY_APPEND; - rt->fib6_nsiblings = match->fib6_nsiblings; - list_add_tail(&rt->fib6_siblings, &match->fib6_siblings); - match->fib6_nsiblings++; - /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ + fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, - &match->fib6_siblings, fib6_siblings) { + &rt->fib6_siblings, fib6_siblings) { sibling->fib6_nsiblings++; - BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings); + BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); + fib6_nsiblings++; } - - rt6_multipath_rebalance(match); + BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); + rt6_multipath_rebalance(temp_sibling); } /* @@ -1043,8 +1067,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, add: nlflags |= NLM_F_CREATE; - err = call_fib6_entry_notifiers(info->nl_net, event, rt, - extack); + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_ADD, + rt, extack); if (err) return err; @@ -1062,7 +1087,7 @@ add: } } else { - struct fib6_info *tmp; + int nsiblings; if (!found) { if (add) @@ -1077,57 +1102,48 @@ add: if (err) return err; - /* if route being replaced has siblings, set tmp to - * last one, otherwise tmp is current route. this is - * used to set fib6_next for new route - */ - if (iter->fib6_nsiblings) - tmp = list_last_entry(&iter->fib6_siblings, - struct fib6_info, - fib6_siblings); - else - tmp = iter; - - /* insert new route */ atomic_inc(&rt->fib6_ref); rcu_assign_pointer(rt->fib6_node, fn); - rt->fib6_next = tmp->fib6_next; + rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); - if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } + nsiblings = iter->fib6_nsiblings; + iter->fib6_node = NULL; + fib6_purge_rt(iter, fn, info->nl_net); + if (rcu_access_pointer(fn->rr_ptr) == iter) + fn->rr_ptr = NULL; + fib6_info_release(iter); - /* delete old route */ - rt = iter; - - if (rt->fib6_nsiblings) { - struct fib6_info *tmp; - + if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ - list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings, - fib6_siblings) { - iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); - if (rcu_access_pointer(fn->rr_ptr) == iter) - fn->rr_ptr = NULL; - fib6_info_release(iter); - - rt->fib6_nsiblings--; - info->nl_net->ipv6.rt6_stats->fib_rt_entries--; + ins = &rt->fib6_next; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + while (iter) { + if (iter->fib6_metric > rt->fib6_metric) + break; + if (rt6_qualify_for_ecmp(iter)) { + *ins = iter->fib6_next; + iter->fib6_node = NULL; + fib6_purge_rt(iter, fn, info->nl_net); + if (rcu_access_pointer(fn->rr_ptr) == iter) + fn->rr_ptr = NULL; + fib6_info_release(iter); + nsiblings--; + info->nl_net->ipv6.rt6_stats->fib_rt_entries--; + } else { + ins = &iter->fib6_next; + } + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->fib6_table->tb6_lock)); } + WARN_ON(nsiblings != 0); } - - WARN_ON(rt->fib6_nsiblings != 0); - - rt->fib6_node = NULL; - fib6_purge_rt(rt, fn, info->nl_net); - if (rcu_access_pointer(fn->rr_ptr) == rt) - fn->rr_ptr = NULL; - fib6_info_release(rt); } return 0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 86a0e43..63f9941 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3842,7 +3842,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric == rt->fib6_metric && - iter->fib6_nsiblings) + rt6_qualify_for_ecmp(iter)) return iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); @@ -4439,7 +4439,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, */ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); - cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND; nhn++; } diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 78245d6..0f45633 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -740,13 +740,6 @@ ipv6_rt_add() run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64" log_test $? 2 "Attempt to add duplicate route - reject route" - # iproute2 prepend only sets NLM_F_CREATE - # - adds a new route; does NOT convert existing route to ECMP - add_route6 "2001:db8:104::/64" "via 2001:db8:101::2" - run_cmd "$IP -6 ro prepend 2001:db8:104::/64 via 2001:db8:103::2" - check_route6 "2001:db8:104::/64 via 2001:db8:101::2 dev veth1 metric 1024 2001:db8:104::/64 via 2001:db8:103::2 dev veth3 metric 1024" - log_test $? 0 "Add new route for existing prefix (w/o NLM_F_EXCL)" - # route append with same prefix adds a new route # - iproute2 sets NLM_F_CREATE | NLM_F_APPEND add_route6 "2001:db8:104::/64" "via 2001:db8:101::2" @@ -754,27 +747,6 @@ ipv6_rt_add() check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1" log_test $? 0 "Append nexthop to existing route - gw" - add_route6 "2001:db8:104::/64" "via 2001:db8:101::2" - run_cmd "$IP -6 ro append 2001:db8:104::/64 dev veth3" - check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop dev veth3 weight 1" - log_test $? 0 "Append nexthop to existing route - dev only" - - # multipath route can not have a nexthop that is a reject route - add_route6 "2001:db8:104::/64" "via 2001:db8:101::2" - run_cmd "$IP -6 ro append unreachable 2001:db8:104::/64" - log_test $? 2 "Append nexthop to existing route - reject route" - - # reject route can not be converted to multipath route - run_cmd "$IP -6 ro flush 2001:db8:104::/64" - run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64" - run_cmd "$IP -6 ro append 2001:db8:104::/64 via 2001:db8:103::2" - log_test $? 2 "Append nexthop to existing reject route - gw" - - run_cmd "$IP -6 ro flush 2001:db8:104::/64" - run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64" - run_cmd "$IP -6 ro append 2001:db8:104::/64 dev veth3" - log_test $? 2 "Append nexthop to existing reject route - dev only" - # insert mpath directly add_route6 "2001:db8:104::/64" "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1" @@ -819,13 +791,6 @@ ipv6_rt_replace_single() check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::3 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1" log_test $? 0 "Single path with multipath" - # single path with reject - # - add_initial_route6 "nexthop via 2001:db8:101::2" - run_cmd "$IP -6 ro replace unreachable 2001:db8:104::/64" - check_route6 "unreachable 2001:db8:104::/64 dev lo metric 1024" - log_test $? 0 "Single path with reject route" - # single path with single path using MULTIPATH attribute # add_initial_route6 "via 2001:db8:101::2" @@ -873,12 +838,6 @@ ipv6_rt_replace_mpath() check_route6 "2001:db8:104::/64 via 2001:db8:101::3 dev veth1 metric 1024" log_test $? 0 "Multipath with single path via multipath attribute" - # multipath with reject - add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" - run_cmd "$IP -6 ro replace unreachable 2001:db8:104::/64" - check_route6 "unreachable 2001:db8:104::/64 dev lo metric 1024" - log_test $? 0 "Multipath with reject route" - # route replace fails - invalid nexthop 1 add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:111::3 nexthop via 2001:db8:103::3" -- 2.7.4