 *	Linux INET6 implementation
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable; otherwise, round-robin the list.
 *	Fixed routing subtrees.
#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <linux/rtnetlink.h>
#include <net/dst_metadata.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <linux/uaccess.h>
#include <linux/sysctl.h>
static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
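/* CREATE_TRACE_POINTS must be defined in exactly one translation unit before
 * including the trace header, so the fib6_table_lookup tracepoint body is
 * emitted here; the EXPORT makes the tracepoint usable from modules.
 */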
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif
struct uncached_list {
	spinlock_t lock;
	struct list_head head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
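/* The uncached list holds rt6_info that are not owned by the fib6 tree
 * (RTF_CACHE clones created for special cases, and ICMP dsts). Keeping them
 * on a per-cpu list lets device teardown find and re-home them even though
 * no tree walk will ever reach them.
 */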
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
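/* On device unregistration every uncached entry still pointing at the dying
 * device is re-homed onto the netns loopback device, with the idev and
 * netdev refcounts moved over, so a late dst_release() stays safe.
 */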
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
static struct dst_ops ip6_dst_ops_template = {
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}
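/* "mtu ? : dst->dev->mtu" uses the GNU ?: extension: return the raw RTAX_MTU
 * metric when it is non-zero, otherwise fall back to the device MTU.
 */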
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.destroy	= ip6_dst_destroy,
	.check		= ip6_dst_check,
	.mtu		= ip6_blackhole_mtu,
	.default_advmss	= ip6_default_advmss,
	.update_pmtu	= ip6_rt_blackhole_update_pmtu,
	.redirect	= ip6_rt_blackhole_redirect,
	.cow_metrics	= dst_cow_metrics_generic,
	.neigh_lookup	= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
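/* dst_alloc() has already set up the embedded dst_entry, so only the
 * rt6_info fields that follow it are zeroed: "dst + 1" is the first byte
 * past the dst member, and the length is the remainder of struct rt6_info.
 */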
/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
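/* Hash-threshold multipath: each sibling nexthop owns a contiguous slice of
 * the 31-bit hash space, bounded above by nh_upper_bound, and a flow follows
 * the first sibling whose bound its mp_hash does not exceed. As an
 * illustration (weights assumed, not from this file): two nexthops weighted
 * 1:3 would get upper bounds at roughly 25% and 100% of the hash space.
 */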
/*
 *	Route lookup. rcu_read_lock() should be held.
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now; however, we need to check if it
	 * is really so, aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
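/* The neighbour solicitation itself is deferred to a workqueue because
 * rt6_probe() runs under RCU in contexts that must not sleep; the probe is
 * rate-limited per gateway via neigh->updated and rtr_probe_interval.
 */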
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
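/* The score packs several signals into one int: rt6_check_dev() contributes
 * 2 for an outgoing-interface match, the RFC 4191 router preference is added
 * in bits 2 and up, and a negative rt6_nud_state from rt6_check_neigh()
 * either disqualifies the route or requests round-robin.
 */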
/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
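/* Equal-cost entries with the same metric are served round-robin: fn->rr_ptr
 * remembers where the last selection stopped, and is only advanced under
 * tb6_lock when the scorer reports that the current head is not (probably)
 * reachable.
 */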
static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
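/* This implements the RFC 4191 Route Information Option: router
 * advertisements can carry more-specific routes with a preference and
 * lifetime. A zero lifetime deletes the route, a finite one arms fib6
 * expiry, and an infinite one clears it.
 */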
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device itself if it is an L3 master device, the master
		 * device if the device is enslaved, and the loopback
		 * device otherwise
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* in the remaining case netif_is_l3_master(dev) is true,
		 * and dev itself is the right device to return
		 */
	}

	return dev;
}
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
}

/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
}
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);
	else
		fib6_info_release(rt);

	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold a dst reference before calling it.
 */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
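/* Exceptions are per-destination overrides learned at runtime (PMTU values,
 * redirects). They are hashed by destination - plus source when subtrees
 * are compiled in - into a per-fib6_info array of FIB6_EXCEPTION_BUCKET_SIZE
 * hlist buckets.
 */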
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
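/* MTU precedence for a fib entry: a stored PMTU wins, otherwise the egress
 * device's mtu6; the result is clamped to IP6_MAX_MTU and reduced by any
 * lightweight-tunnel encapsulation headroom.
 */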
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in a subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
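/* Inserting an exception bumps the fib node's sernum, which invalidates the
 * cookie of every cached dst derived from this entry; the next dst_check()
 * on such a dst fails and forces its owner to redo the lookup, picking up
 * the new exception.
 */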
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
/* Find cached rt in the hash table inside passed-in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in a subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
/* Remove the passed-in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in a subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in a subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}
static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */
	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still hold references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}
	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree. It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look up the route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */
		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
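/* For ICMPv6 errors the embedded (inner) header is hashed instead of the
 * outer one, so the error follows the same multipath leg as the flow that
 * triggered it and reaches the host holding state for that flow.
 */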
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
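/* Policy 0 hashes L3 fields only (addresses, flow label, protocol); policy 1
 * adds the L4 ports. The final shift keeps the result within the 31-bit
 * range of the nexthop upper bounds used by fib6_multipath_select().
 */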
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);

	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
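/* The blackhole route keeps the metrics of the original dst but discards
 * every packet; xfrm uses it to park flows while policy resolution is still
 * in progress, so callers always end up holding a valid dst.
 */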
/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}

static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 rt_cookie = 0;

	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))
		return &rt->dst;
	else
		return NULL;
}

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}

static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	bool from_set;

	rcu_read_lock();
	from_set = !!rcu_dereference(rt->from);
	rcu_read_unlock();

	return !(rt->rt6i_flags & RTF_CACHE) &&
	       (rt->rt6i_flags & RTF_PCPU || from_set);
}
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_info *from;
		struct rt6_info *nrt6;

		rcu_read_lock();
		from = rcu_dereference(rt6->from);
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from the appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routers.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		ip6_hold_safe(net, &ret, true);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return ret;
}
2510 static struct dst_entry *ip6_route_redirect(struct net *net,
2511 const struct flowi6 *fl6,
2512 const struct sk_buff *skb,
2513 const struct in6_addr *gateway)
2515 int flags = RT6_LOOKUP_F_HAS_SADDR;
2516 struct ip6rd_flowi rdfl;
2519 rdfl.gateway = *gateway;
2521 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2522 flags, __ip6_route_redirect);
2525 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2528 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2529 struct dst_entry *dst;
2532 memset(&fl6, 0, sizeof(fl6));
2533 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2534 fl6.flowi6_oif = oif;
2535 fl6.flowi6_mark = mark;
2536 fl6.daddr = iph->daddr;
2537 fl6.saddr = iph->saddr;
2538 fl6.flowlabel = ip6_flowinfo(iph);
2539 fl6.flowi6_uid = uid;
2541 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2542 rt6_do_redirect(dst, NULL, skb);
2545 EXPORT_SYMBOL_GPL(ip6_redirect);
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
2556 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2557 fl6.flowi6_oif = oif;
2558 fl6.flowi6_mark = mark;
2559 fl6.daddr = msg->dest;
2560 fl6.saddr = iph->daddr;
2561 fl6.flowi6_uid = sock_net_uid(net, NULL);
2563 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}
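
/* Worked example (sketch): for a standard 1500-byte Ethernet MTU,
 * advmss = 1500 - sizeof(struct ipv6hdr) - sizeof(struct tcphdr)
 *	  = 1500 - 40 - 20 = 1440 bytes,
 * bounded below by the net.ipv6.route.min_adv_mss sysctl.
 */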
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	struct inet6_dev *idev;
	unsigned int mtu;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct in6_addr *src_key;
	struct inet6_dev *idev;
	u32 mtu = 0;

	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	if (likely(!mtu)) {
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}
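
/* Illustrative precedence (assumed values): with RTAX_MTU locked at 1400,
 * a 1280-byte nexthop exception is ignored and 1400 wins; without the
 * lock, the 1280 exception takes precedence over a device mtu6 of 1500.
 */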
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_gateway = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout >> 1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire >> rt_elasticity;
	return entries > rt_max_size;
}
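
/* Sketch of the backoff above (assuming the default gc_elasticity of 9):
 * each invocation decays gc_expire by gc_expire >> 9, while every pass
 * under pressure increments it, so fib6_run_gc() receives an increasingly
 * aggressive timeout until entries drop below gc_thresh, at which point
 * gc_expire is reset to gc_timeout / 2.
 */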
static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
			       struct fib6_config *cfg)
{
	struct dst_metrics *p;

	if (!cfg->fc_mx)
		return 0;

	p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
	if (unlikely(!p))
		return -ENOMEM;

	refcount_set(&p->refcnt, 1);
	rt->fib6_metrics = p;

	return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
}
2750 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2751 struct fib6_config *cfg,
2752 const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}
2782 static int ip6_route_check_nh_onlink(struct net *net,
2783 struct fib6_config *cfg,
2784 const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		if (!grt->dst.error &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}

		ip6_rt_put(grt);
	}

	return err;
}
2809 static int ip6_route_check_nh(struct net *net,
2810 struct fib6_config *cfg,
2811 struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2860 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2861 struct net_device **_dev, struct inet6_dev **idev,
2862 struct netlink_ext_ack *extack)
2864 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2865 int gwa_type = ipv6_addr_type(gw_addr);
2866 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2867 const struct net_device *dev = *_dev;
2868 bool need_addr_check = !dev;
2871 /* if gw_addr is local we will fail to detect this in case
2872 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2873 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	err = -EINVAL;
	if (ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}
2882 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2883 /* IPv6 strictly inhibits using not link-local
2884 * addresses as nexthop address.
		 * Otherwise, the router will not be able to send redirects.
2886 * It is very good, but in some (rare!) circumstances
2887 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2888 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing.
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}
		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);
		if (err)
			goto out;
	}
	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}
2919 /* if we did not check gw_addr above, do so now that the
2920 * egress device has been resolved.
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					       gfp_t gfp_flags,
					       struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;
	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}
2987 if (cfg->fc_metric == 0)
2988 cfg->fc_metric = IP6_RT_PRIO_USER;
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}
	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;
	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
		goto out;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);
3038 if (cfg->fc_protocol == RTPROT_UNSPEC)
3039 cfg->fc_protocol = RTPROT_BOOT;
3040 rt->fib6_protocol = cfg->fc_protocol;
3042 addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}
3055 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3056 rt->fib6_dst.plen = cfg->fc_dst_len;
3057 if (rt->fib6_dst.plen == 128)
3058 rt->dst_host = true;
3060 #ifdef CONFIG_IPV6_SUBTREES
3061 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif
3065 rt->fib6_metric = cfg->fc_metric;
3066 rt->fib6_nh.nh_weight = 1;
3068 rt->fib6_type = cfg->fc_type;
	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}
	if (cfg->fc_flags & RTF_GATEWAY) {
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;
	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;
	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->fib6_nh.nh_dev = dev;
	rt->fib6_table = table;
	cfg->fc_nlinfo.nl_net = dev_net(dev);

	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	fib6_info_release(rt);
	return ERR_PTR(err);
}
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
		  struct netlink_ext_ack *extack)
{
	struct fib6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, gfp_flags, extack);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
	fib6_info_release(rt);

	return err;
}
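
/* Minimal in-kernel usage sketch (hypothetical values, error handling
 * elided): adding 2001:db8::/64 via a link-local gateway would fill a
 * fib6_config roughly like this:
 *
 *	struct fib6_config cfg = {
 *		.fc_table	= RT6_TABLE_MAIN,
 *		.fc_metric	= IP6_RT_PRIO_USER,
 *		.fc_dst		= <2001:db8::>,
 *		.fc_dst_len	= 64,
 *		.fc_gateway	= <fe80::1>,
 *		.fc_ifindex	= <ifindex of eth0>,
 *		.fc_flags	= RTF_UP | RTF_GATEWAY,
 *		.fc_nlinfo.nl_net = net,
 *	};
 *	err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
 */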
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	fib6_info_release(rt);
	return err;
}
int ip6_del_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net };

	return __ip6_del_rt(rt, &info);
}
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
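
/* Design note: deleting a multipath route this way emits a single
 * RTM_DELROUTE that carries every nexthop (the skb built above, with
 * skip_notify set while the siblings are removed), so netlink listeners
 * see one atomic update instead of per-sibling notifications.
 */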
static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
	int rc = -ESRCH;

	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
		goto out;

	if (cfg->fc_flags & RTF_GATEWAY &&
	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
		goto out;
	if (dst_hold_safe(&rt->dst))
		rc = rt6_remove_exception_rt(rt);
out:
	return rc;
}
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct fib6_info *from;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);
	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}
	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}
	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}
	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}
3401 /* Redirect received -> path was valid.
3402 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
3405 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3416 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3417 NEIGH_UPDATE_F_OVERRIDE|
3418 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3419 NEIGH_UPDATE_F_ISROUTER)),
3420 NDISC_REDIRECT, &ndopts);
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	/* This fib6_info_hold() is safe here because we hold reference to rt
	 * and rt already holds reference to fib6_info.
	 */
	fib6_info_hold(from);
	rcu_read_unlock();

	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * take it over.
	 */
	if (rt6_insert_exception(nrt, from)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}
3449 netevent.old = &rt->dst;
3450 netevent.new = &nrt->dst;
3451 netevent.daddr = &msg->dest;
3452 netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	fib6_info_release(from);
	neigh_release(neigh);
}
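
/* Redirect handling above, in short: validate the rd_msg (length,
 * non-multicast destination, link-local target unless on-link), require
 * accept_redirects and !forwarding on the ingress device, confirm the
 * current nexthop, update the neighbour cache from the target lladdr
 * option, then install the new gateway as an RTF_CACHE exception route
 * and raise NETEVENT_REDIRECT.
 */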
3460 #ifdef CONFIG_IPV6_ROUTE_INFO
3461 static struct fib6_info *rt6_get_route_info(struct net *net,
3462 const struct in6_addr *prefix, int prefixlen,
3463 const struct in6_addr *gwaddr,
3464 struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3467 int ifindex = dev->ifindex;
3468 struct fib6_node *fn;
3469 struct fib6_info *rt = NULL;
3470 struct fib6_table *table;
	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
3482 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3484 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3486 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3497 static struct fib6_info *rt6_add_route_info(struct net *net,
3498 const struct in6_addr *prefix, int prefixlen,
3499 const struct in6_addr *gwaddr,
3500 struct net_device *dev,
					   unsigned int pref)
{
	struct fib6_config cfg = {
3504 .fc_metric = IP6_RT_PRIO_USER,
3505 .fc_ifindex = dev->ifindex,
3506 .fc_dst_len = prefixlen,
3507 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3508 RTF_UP | RTF_PREF(pref),
3509 .fc_protocol = RTPROT_RA,
3510 .fc_type = RTN_UNICAST,
3511 .fc_nlinfo.portid = 0,
3512 .fc_nlinfo.nlh = NULL,
3513 .fc_nlinfo.nl_net = net,
	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3517 cfg.fc_dst = *prefix;
3518 cfg.fc_gateway = *gwaddr;
	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
		cfg.fc_flags |= RTF_DEFAULT;

	ip6_route_add(&cfg, GFP_ATOMIC, NULL);

	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
}
#endif
3530 struct fib6_info *rt6_get_dflt_router(struct net *net,
3531 const struct in6_addr *addr,
3532 struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3535 struct fib6_info *rt;
3536 struct fib6_table *table;
	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
3555 struct fib6_info *rt6_add_dflt_router(struct net *net,
3556 const struct in6_addr *gwaddr,
3557 struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
3561 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3562 .fc_metric = IP6_RT_PRIO_USER,
3563 .fc_ifindex = dev->ifindex,
3564 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3565 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3566 .fc_protocol = RTPROT_RA,
3567 .fc_type = RTN_UNICAST,
3568 .fc_nlinfo.portid = 0,
3569 .fc_nlinfo.nlh = NULL,
3570 .fc_nlinfo.nl_net = net,
	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}
3586 static void __rt6_purge_dflt_routers(struct net *net,
3587 struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(net, table);
		}
	}

	rcu_read_unlock();
}
3629 static void rtmsg_to_fib6_config(struct net *net,
3630 struct in6_rtmsg *rtmsg,
3631 struct fib6_config *cfg)
{
	memset(cfg, 0, sizeof(*cfg));
	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
			: RT6_TABLE_MAIN;
	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3638 cfg->fc_metric = rtmsg->rtmsg_metric;
3639 cfg->fc_expires = rtmsg->rtmsg_info;
3640 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3641 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3642 cfg->fc_flags = rtmsg->rtmsg_flags;
3643 cfg->fc_type = rtmsg->rtmsg_type;
	cfg->fc_nlinfo.portid = 0;
	cfg->fc_nlinfo.nlh = NULL;
	cfg->fc_nlinfo.nl_net = net;

	cfg->fc_dst = rtmsg->rtmsg_dst;
	cfg->fc_src = rtmsg->rtmsg_src;
	cfg->fc_gateway = rtmsg->rtmsg_gateway;
}
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg, NULL);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}
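
/* Userspace sketch of this legacy path (illustrative, error handling
 * omitted): route(8)-style tools fill struct in6_rtmsg and issue
 * SIOCADDRT on any AF_INET6 socket:
 *
 *	struct in6_rtmsg rtmsg = {};
 *	inet_pton(AF_INET6, "2001:db8::", &rtmsg.rtmsg_dst);
 *	rtmsg.rtmsg_dst_len = 64;
 *	rtmsg.rtmsg_ifindex = if_nametoindex("eth0");
 *	rtmsg.rtmsg_metric = 1;
 *	rtmsg.rtmsg_flags = RTF_UP;
 *	ioctl(socket(AF_INET6, SOCK_DGRAM, 0), SIOCADDRT, &rtmsg);
 */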
/*
 *	Drop the packet on the floor
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);

	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
/*
 *	Allocate a dst for local (unicast / anycast) address.
 */
3743 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3744 struct inet6_dev *idev,
3745 const struct in6_addr *addr,
				     bool anycast, gfp_t gfp_flags)
{
	u32 tb_id;
	struct net_device *dev = idev->dev;
	struct fib6_info *f6i;

	f6i = fib6_info_alloc(gfp_flags);
	if (!f6i)
		return ERR_PTR(-ENOMEM);
3756 f6i->dst_nocount = true;
3757 f6i->dst_host = true;
3758 f6i->fib6_protocol = RTPROT_KERNEL;
	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast) {
		f6i->fib6_type = RTN_ANYCAST;
		f6i->fib6_flags |= RTF_ANYCAST;
	} else {
		f6i->fib6_type = RTN_LOCAL;
		f6i->fib6_flags |= RTF_LOCAL;
	}
	f6i->fib6_nh.nh_gw = *addr;
	dev_hold(dev);
	f6i->fib6_nh.nh_dev = dev;
	f6i->fib6_dst.addr = *addr;
	f6i->fib6_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	f6i->fib6_table = fib6_get_table(net, tb_id);

	return f6i;
}
3779 /* remove deleted ip from prefsrc entries */
3780 struct arg_dev_net_ip {
	struct net_device *dev;
	struct net *net;
	struct in6_addr *addr;
};
3786 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3789 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3790 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3792 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3793 rt != net->ipv6.fib6_null_entry &&
3794 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3795 spin_lock_bh(&rt6_exception_lock);
3796 /* remove prefsrc entry */
3797 rt->fib6_prefsrc.plen = 0;
3798 /* need to update cache as well */
		rt6_exceptions_remove_prefsrc(rt);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
3805 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
3808 struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
3816 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
/* Remove routers and update dst entries when a gateway turns into a host. */
3819 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;
3823 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
		return -1;
	}
3828 /* Further clean up cached routes in exception table.
3829 * This is needed because cached route may have a different
3830 * gateway than its 'parent' in the case of an ip redirect.
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}

void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3842 struct arg_netdev_event {
3843 const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};
3850 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
static bool rt6_is_dead(const struct fib6_info *rt)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	     fib6_ignore_linkdown(rt)))
		return true;

	return false;
}
static int rt6_multipath_total_weight(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	int total = 0;

	if (!rt6_is_dead(rt))
		total += rt->fib6_nh.nh_weight;

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
		if (!rt6_is_dead(iter))
			total += iter->fib6_nh.nh_weight;
	}

	return total;
}
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}
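
/* Worked example (sketch): two nexthops with weights 1 and 2 (total 3)
 * get upper bounds of about (1 * 2^31) / 3 - 1 and 2^31 - 1. A flow
 * hash drawn from [0, 2^31) then lands on the first hop ~1/3 of the
 * time and on the second ~2/3, matching weight = rtnh_hops + 1.
 */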
static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
	struct fib6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;
3924 /* In case the entire multipath route was marked for flushing,
3925 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;
3931 /* During lookup routes are evaluated in order, so we need to
3932 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
3943 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	struct net *net = dev_net(arg->dev);
	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
		fib6_update_sernum_upto_root(net, rt);
		rt6_multipath_rebalance(rt);
	}

	return 0;
}
3957 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
3972 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3973 const struct net_device *dev)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		return true;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			return true;

	return false;
}
static void rt6_multipath_flush(struct fib6_info *rt)
{
	struct fib6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		iter->should_flush = 1;
}
3995 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3996 const struct net_device *down_dev)
{
	struct fib6_info *iter;
	unsigned int dead = 0;

	if (rt->fib6_nh.nh_dev == down_dev ||
	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		dead++;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == down_dev ||
		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
			dead++;

	return dead;
}
4012 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4013 const struct net_device *dev,
4014 unsigned int nh_flags)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		rt->fib6_nh.nh_flags |= nh_flags;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			iter->fib6_nh.nh_flags |= nh_flags;
}
4025 /* called with write lock held for table with rt */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;
4035 switch (arg->event) {
4036 case NETDEV_UNREGISTER:
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
4069 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};

	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
};
4093 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;
	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/
	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;
	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase,
	   updating PMTU on increase is a MUST (e.g. for jumbo frames).
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
4138 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4139 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4140 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4141 [RTA_OIF] = { .type = NLA_U32 },
4142 [RTA_IIF] = { .type = NLA_U32 },
4143 [RTA_PRIORITY] = { .type = NLA_U32 },
4144 [RTA_METRICS] = { .type = NLA_NESTED },
4145 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4146 [RTA_PREF] = { .type = NLA_U8 },
4147 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4148 [RTA_ENCAP] = { .type = NLA_NESTED },
4149 [RTA_EXPIRES] = { .type = NLA_U32 },
4150 [RTA_UID] = { .type = NLA_U32 },
4151 [RTA_MARK] = { .type = NLA_U32 },
4152 [RTA_TABLE] = { .type = NLA_U32 },
4153 [RTA_IP_PROTO] = { .type = NLA_U8 },
4154 [RTA_SPORT] = { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4158 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4159 struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
4174 memset(cfg, 0, sizeof(*cfg));
4176 cfg->fc_table = rtm->rtm_table;
4177 cfg->fc_dst_len = rtm->rtm_dst_len;
4178 cfg->fc_src_len = rtm->rtm_src_len;
4179 cfg->fc_flags = RTF_UP;
4180 cfg->fc_protocol = rtm->rtm_protocol;
4181 cfg->fc_type = rtm->rtm_type;
4183 if (rtm->rtm_type == RTN_UNREACHABLE ||
4184 rtm->rtm_type == RTN_BLACKHOLE ||
4185 rtm->rtm_type == RTN_PROHIBIT ||
4186 rtm->rtm_type == RTN_THROW)
4187 cfg->fc_flags |= RTF_REJECT;
4189 if (rtm->rtm_type == RTN_LOCAL)
4190 cfg->fc_flags |= RTF_LOCAL;
4192 if (rtm->rtm_flags & RTM_F_CLONED)
4193 cfg->fc_flags |= RTF_CACHE;
4195 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4197 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4198 cfg->fc_nlinfo.nlh = nlh;
4199 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4201 if (tb[RTA_GATEWAY]) {
4202 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4203 cfg->fc_flags |= RTF_GATEWAY;
	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}
4224 if (tb[RTA_PREFSRC])
4225 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4230 if (tb[RTA_PRIORITY])
4231 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4233 if (tb[RTA_METRICS]) {
4234 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4235 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4241 if (tb[RTA_MULTIPATH]) {
4242 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4243 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4245 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
4253 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4254 pref != ICMPV6_ROUTER_PREF_HIGH)
4255 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4256 cfg->fc_flags |= RTF_PREF(pref);
	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];
4262 if (tb[RTA_ENCAP_TYPE]) {
4263 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
struct rt6_nh {
	struct fib6_info *fib6_info;
	struct fib6_config r_cfg;
	struct list_head next;
};
static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
{
	struct rt6_nh *nh;

	list_for_each_entry(nh, rt6_nh_list, next) {
		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
			&nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
			nh->r_cfg.fc_ifindex);
	}
}
4301 static int ip6_route_info_append(struct net *net,
4302 struct list_head *rt6_nh_list,
4303 struct fib6_info *rt,
4304 struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if fib6_info already exists */
		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->fib6_info = rt;
	err = ip6_convert_metrics(net, rt, r_cfg);
	if (err) {
		kfree(nh);
		return err;
	}
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}
4330 static void ip6_route_mpath_notify(struct fib6_info *rt,
4331 struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
4335 /* if this is an APPEND route, then rt points to the first route
4336 * inserted and rt_last points to last route inserted. Userspace
4337 * wants a consistent dump of the route which starts at the first
4338 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
4351 static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4355 struct nl_info *info = &cfg->fc_nlinfo;
4356 struct fib6_config r_cfg;
4357 struct rtnexthop *rtnh;
4358 struct fib6_info *rt;
4359 struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);
4370 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4371 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4372 nlflags |= NLM_F_APPEND;
4374 remaining = cfg->fc_mp_len;
4375 rtnh = (struct rtnexthop *)cfg->fc_mp;
4377 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4378 * fib6_info structs per nexthop
4380 while (rtnh_ok(rtnh, remaining)) {
4381 memcpy(&r_cfg, cfg, sizeof(*cfg));
4382 if (rtnh->rtnh_ifindex)
4383 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}
		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}
4427 /* for add and replace send one notification with all nexthops.
4428 * Skip the notification in fib6_add_rt2node and send one with
4429 * the full route when done
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}
	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
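
/* Example (illustrative): "ip -6 route add 2001:db8::/64
 *	nexthop via fe80::1 dev eth0 nexthop via fe80::2 dev eth1"
 * arrives as one RTA_MULTIPATH blob; each rtnexthop above becomes its
 * own fib6_info on rt6_nh_list, and fib6_add_rt2node() links them as
 * siblings when they are inserted one by one.
 */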
4498 static int ip6_route_multipath_del(struct fib6_config *cfg,
4499 struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;
4507 remaining = cfg->fc_mp_len;
4508 rtnh = (struct rtnexthop *)cfg->fc_mp;
4510 /* Parse a Multipath Entry */
4511 while (rtnh_ok(rtnh, remaining)) {
4512 memcpy(&r_cfg, cfg, sizeof(*cfg));
4513 if (rtnh->rtnh_ifindex)
4514 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}
4536 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4537 struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);

	cfg.fc_delete_all_nh = 1;
	return ip6_route_del(&cfg, extack);
}
4554 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4555 struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);
	else
		return ip6_route_add(&cfg, GFP_KERNEL, extack);
}
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;
4574 if (rt->fib6_nsiblings) {
4575 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4576 + NLA_ALIGN(sizeof(struct rtnexthop))
4577 + nla_total_size(16) /* RTA_GATEWAY */
4578 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
		nexthop_len *= rt->fib6_nsiblings;
	}
4583 return NLMSG_ALIGN(sizeof(struct rtmsg))
4584 + nla_total_size(16) /* RTA_SRC */
4585 + nla_total_size(16) /* RTA_DST */
4586 + nla_total_size(16) /* RTA_GATEWAY */
4587 + nla_total_size(16) /* RTA_PREFSRC */
4588 + nla_total_size(4) /* RTA_TABLE */
4589 + nla_total_size(4) /* RTA_IIF */
4590 + nla_total_size(4) /* RTA_OIF */
4591 + nla_total_size(4) /* RTA_PRIORITY */
4592 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4593 + nla_total_size(sizeof(struct rta_cacheinfo))
4594 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4595 + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}
4600 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;
4604 *flags |= RTNH_F_DEAD;
4606 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}
4615 if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}
4620 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4621 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4622 *flags |= RTNH_F_OFFLOAD;
4624 /* not needed for multipath encoding b/c it has a rtnexthop struct */
4625 if (!skip_oif && rt->fib6_nh.nh_dev &&
4626 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4627 goto nla_put_failure;
	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4639 /* add multipath next hop */
4640 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
4643 struct rtnexthop *rtnh;
4644 unsigned int flags = 0;
	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;
4650 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4651 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4653 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4654 goto nla_put_failure;
4656 rtnh->rtnh_flags = flags;
4658 /* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
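
/* Layout sketch of one nexthop as encoded above:
 *
 *	struct rtnexthop   { rtnh_len, rtnh_flags, rtnh_hops, rtnh_ifindex }
 *	RTA_GATEWAY        (optional, 16-byte IPv6 address)
 *	RTA_ENCAP/..._TYPE (optional, via lwtunnel_fill_encap())
 *
 * rtnh_len is back-filled once the nested attributes are in place.
 */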
4667 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4668 struct fib6_info *rt, struct dst_entry *dst,
4669 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires = 0;
	u32 *pmetrics;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;
4683 rtm = nlmsg_data(nlh);
4684 rtm->rtm_family = AF_INET6;
4685 rtm->rtm_dst_len = rt->fib6_dst.plen;
	rtm->rtm_src_len = rt->fib6_src.plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
4693 if (nla_put_u32(skb, RTA_TABLE, table))
4694 goto nla_put_failure;
	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt->fib6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
4705 if (nla_put_in6_addr(skb, RTA_DST, dest))
4706 goto nla_put_failure;
4707 rtm->rtm_dst_len = 128;
4708 } else if (rtm->rtm_dst_len)
4709 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
4710 goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;

		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}
	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;

		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}
4747 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4748 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4749 goto nla_put_failure;
4751 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4752 goto nla_put_failure;
	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}
	if (rt->fib6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
}
4824 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4825 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
4828 struct nlattr *tb[RTA_MAX+1];
4829 int err, iif = 0, oif = 0;
4830 struct fib6_info *from;
4831 struct dst_entry *dst;
4832 struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}
	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}
	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
4965 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4966 unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
4996 static int ip6_route_dev_notify(struct notifier_block *this,
				  unsigned long event, void *ptr)
{
4999 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5000 struct net *net = dev_net(dev);
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;
5005 if (event == NETDEV_REGISTER) {
5006 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5007 net->ipv6.ip6_null_entry->dst.dev = dev;
5008 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5009 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5010 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5011 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5012 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
5016 dev->reg_state != NETREG_UNREGISTERED) {
5017 /* NETDEV_UNREGISTER could be fired for multiple times by
5018 * netdev_wait_allrefs(). Make sure we only call this once.
5020 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5021 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5022 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
5034 #ifdef CONFIG_PROC_FS
5035 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

5038 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5039 net->ipv6.rt6_stats->fib_nodes,
5040 net->ipv6.rt6_stats->fib_route_nodes,
5041 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5042 net->ipv6.rt6_stats->fib_rt_entries,
5043 net->ipv6.rt6_stats->fib_rt_cache,
5044 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */
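
/* Sample /proc/net/rt6_stats line (illustrative values), one %04x field
 * per counter printed by rt6_stats_seq_show() above:
 *
 *	0012 0034 0008 0102 0000 0040 0000
 *
 * i.e. fib nodes, route nodes, allocated routes, route entries, cached
 * routes, dst entries, discarded routes.
 */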
5051 #ifdef CONFIG_SYSCTL
5054 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
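
/* Clone the template for a new netns, repoint every entry at that netns'
 * own storage, and hide the privileged "flush" entry from user namespaces
 * other than the initial one.  The caller registers and eventually frees
 * the copy.
 */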
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif	/* CONFIG_SYSCTL */
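
/* Per-netns init: copy the dst_ops template, allocate this netns' private
 * copies of the special routes (fib6 null, ip6 null and, with policy
 * routing, prohibit/blackhole) and seed the GC/PMTU sysctl defaults.
 * The unwind labels at the bottom free in reverse allocation order.
 */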
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
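
/* Per-netns teardown: mirror image of ip6_route_net_init(). */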
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
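
/* The proc entries live in a separate "late" pernet op: it is registered
 * in ip6_route_init() only after fib6_init() and fib6_rules_init(), so
 * /proc/net/ipv6_route and /proc/net/rt6_stats only appear once the fib
 * machinery they walk is set up.
 */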
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
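
/* Per-netns inet_peer base used for long-lived per-remote-peer state
 * (e.g. ICMPv6 rate limiting).  Exit invalidates the whole tree and frees
 * the base.
 */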
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
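
/* Module init.  Registration order matters: each successfully initialised
 * component has a matching unwind label below, taken in reverse order on
 * failure.
 */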
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
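
/* Mirror of ip6_route_init(): unregister and free everything in reverse
 * order of initialisation.
 */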
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}