/*
 *	Linux INET6 implementation
 *
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Fixed routing subtrees.
 */
#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <linux/rtnetlink.h>
#include <net/dst_metadata.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <linux/uaccess.h>

#include <linux/sysctl.h>
static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
        RT6_NUD_FAIL_HARD = -3,
        RT6_NUD_FAIL_PROBE = -2,
        RT6_NUD_FAIL_DO_RR = -1,
        RT6_NUD_SUCCEED = 1,
};
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
                           struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                               struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
                            struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
                         struct fib6_info *rt, struct dst_entry *dst,
                         struct in6_addr *dest, struct in6_addr *src,
                         int iif, int type, u32 portid, u32 seq,
                         unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
                                           const struct in6_addr *daddr,
                                           const struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
                                            const struct in6_addr *prefix, int prefixlen,
                                            const struct in6_addr *gwaddr,
                                            struct net_device *dev,
                                            unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
                                            const struct in6_addr *prefix, int prefixlen,
                                            const struct in6_addr *gwaddr,
                                            struct net_device *dev);
#endif
struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
void rt6_uncached_list_add(struct rt6_info *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

        rt->rt6i_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->rt6i_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}
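
/* Illustrative sketch (not part of the kernel sources): the uncached
 * list is the classic per-CPU bucket pattern - each CPU owns its own
 * list head and lock, so adders on different CPUs never contend.  A
 * minimal model of the same idea, with hypothetical names:
 *
 *	struct bucket {
 *		spinlock_t lock;
 *		struct list_head head;
 *	};
 *	static DEFINE_PER_CPU_ALIGNED(struct bucket, buckets);
 *
 *	static void bucket_add(struct list_head *item)
 *	{
 *		struct bucket *b = raw_cpu_ptr(&buckets);
 *
 *		spin_lock_bh(&b->lock);
 *		list_add_tail(item, &b->head);
 *		spin_unlock_bh(&b->lock);
 *	}
 *
 * The _bh lock variant matters because routes can also be added from
 * softirq context; a plain spin_lock() could deadlock against the same
 * lock taken in a softirq on the same CPU.
 */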

void rt6_uncached_list_del(struct rt6_info *rt)
{
        if (!list_empty(&rt->rt6i_uncached)) {
                struct uncached_list *ul = rt->rt6i_uncached_list;
                struct net *net = dev_net(rt->dst.dev);

                spin_lock_bh(&ul->lock);
                list_del(&rt->rt6i_uncached);
                atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
                spin_unlock_bh(&ul->lock);
        }
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
        struct net_device *loopback_dev = net->loopback_dev;
        int cpu;

        if (dev == loopback_dev)
                return;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
                struct rt6_info *rt;

                spin_lock_bh(&ul->lock);
                list_for_each_entry(rt, &ul->head, rt6i_uncached) {
                        struct inet6_dev *rt_idev = rt->rt6i_idev;
                        struct net_device *rt_dev = rt->dst.dev;

                        if (rt_idev->dev == dev) {
                                rt->rt6i_idev = in6_dev_get(loopback_dev);
                                in6_dev_put(rt_idev);
                        }

                        if (rt_dev == dev) {
                                rt->dst.dev = loopback_dev;
                                dev_hold(rt->dst.dev);
                                dev_put(rt_dev);
                        }
                }
                spin_unlock_bh(&ul->lock);
        }
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
                                             struct sk_buff *skb,
                                             const void *daddr)
{
        if (!ipv6_addr_any(p))
                return (const void *) p;
        else if (skb)
                return &ipv6_hdr(skb)->daddr;
        return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
                                   struct net_device *dev,
                                   struct sk_buff *skb,
                                   const void *daddr)
{
        struct neighbour *n;

        daddr = choose_neigh_daddr(gw, skb, daddr);
        n = __ipv6_neigh_lookup(dev, daddr);
        if (n)
                return n;

        n = neigh_create(&nd_tbl, daddr, dev);
        return IS_ERR(n) ? NULL : n;
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
                                              struct sk_buff *skb,
                                              const void *daddr)
{
        const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

        return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        struct net_device *dev = dst->dev;
        struct rt6_info *rt = (struct rt6_info *)dst;

        daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
        if (!daddr)
                return;
        if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
                return;
        if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
                return;
        __ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
        .check                  =       ip6_dst_check,
        .default_advmss         =       ip6_default_advmss,
        .cow_metrics            =       dst_cow_metrics_generic,
        .destroy                =       ip6_dst_destroy,
        .ifdown                 =       ip6_dst_ifdown,
        .negative_advice        =       ip6_negative_advice,
        .link_failure           =       ip6_link_failure,
        .update_pmtu            =       ip6_rt_update_pmtu,
        .redirect               =       rt6_do_redirect,
        .local_out              =       __ip6_local_out,
        .neigh_lookup           =       ip6_dst_neigh_lookup,
        .confirm_neigh          =       ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

        return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                         struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                                      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
        .destroy                =       ip6_dst_destroy,
        .check                  =       ip6_dst_check,
        .mtu                    =       ip6_blackhole_mtu,
        .default_advmss         =       ip6_default_advmss,
        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
        .redirect               =       ip6_rt_blackhole_redirect,
        .cow_metrics            =       dst_cow_metrics_generic,
        .neigh_lookup           =       ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
        [RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
        .fib6_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .fib6_protocol  = RTPROT_KERNEL,
        .fib6_metric    = ~(u32)0,
        .fib6_ref       = ATOMIC_INIT(1),
        .fib6_type      = RTN_UNREACHABLE,
        .fib6_metrics   = (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .obsolete       = DST_OBSOLETE_FORCE_CHK,
                .error          = -ENETUNREACH,
                .input          = ip6_pkt_discard,
                .output         = ip6_pkt_discard_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .obsolete       = DST_OBSOLETE_FORCE_CHK,
                .input          = ip6_pkt_prohibit,
                .output         = ip6_pkt_prohibit_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .obsolete       = DST_OBSOLETE_FORCE_CHK,
                .input          = dst_discard,
                .output         = dst_discard_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
        struct dst_entry *dst = &rt->dst;

        memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
        INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
                               int flags)
{
        struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
                                        1, DST_OBSOLETE_FORCE_CHK, flags);

        if (rt) {
                rt6_info_init(rt);
                atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
        }

        return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
        struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
        struct rt6_info *rt = (struct rt6_info *)dst;
        struct fib6_info *from;
        struct inet6_dev *idev;

        if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
                kfree(p);

        rt6_uncached_list_del(rt);

        idev = rt->rt6i_idev;
        if (idev) {
                rt->rt6i_idev = NULL;
                in6_dev_put(idev);
        }

        from = xchg((__force struct fib6_info **)&rt->from, NULL);
        fib6_info_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                           int how)
{
        struct rt6_info *rt = (struct rt6_info *)dst;
        struct inet6_dev *idev = rt->rt6i_idev;
        struct net_device *loopback_dev =
                dev_net(dev)->loopback_dev;

        if (idev && idev->dev != loopback_dev) {
                struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);

                if (loopback_idev) {
                        rt->rt6i_idev = loopback_idev;
                        in6_dev_put(idev);
                }
        }
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
        if (rt->rt6i_flags & RTF_EXPIRES)
                return time_after(jiffies, rt->dst.expires);
        else
                return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
        struct fib6_info *from;

        from = rcu_dereference(rt->from);

        if (rt->rt6i_flags & RTF_EXPIRES) {
                if (time_after(jiffies, rt->dst.expires))
                        return true;
        } else if (from) {
                return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
                        fib6_check_expired(from);
        }
        return false;
}

struct fib6_info *fib6_multipath_select(const struct net *net,
                                        struct fib6_info *match,
                                        struct flowi6 *fl6, int oif,
                                        const struct sk_buff *skb,
                                        int strict)
{
        struct fib6_info *sibling, *next_sibling;

        /* We might have already computed the hash for ICMPv6 errors. In such
         * a case it will always be non-zero. Otherwise now is the time to do it.
         */
        if (!fl6->mp_hash)
                fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

        if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
                return match;

        list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
                                 fib6_siblings) {
                int nh_upper_bound;

                nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
                if (fl6->mp_hash > nh_upper_bound)
                        continue;
                if (rt6_score_route(sibling, oif, strict) < 0)
                        break;
                match = sibling;
                break;
        }

        return match;
}
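
/* Illustrative sketch (not part of the kernel sources): the selection
 * above is hash-threshold next-hop choice in the spirit of RFC 2992.
 * Each sibling owns a slice of the 32-bit hash space, identified by an
 * upper bound; the first sibling whose bound is >= the flow hash wins.
 * A minimal standalone model, with hypothetical names:
 *
 *	struct nh { u32 upper_bound; };
 *
 *	static int nh_select(const struct nh *nhs, int n, u32 flow_hash)
 *	{
 *		int i;
 *
 *		for (i = 0; i < n; i++) {
 *			if (flow_hash <= nhs[i].upper_bound)
 *				return i;
 *		}
 *		return n - 1;	/- unreachable if bounds cover the space -/
 *	}
 *
 * With two equal-weight nexthops the bounds split the space roughly in
 * half, so each sees half of the flows, and a given flow always maps
 * to the same nexthop for as long as the bounds are unchanged.
 */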

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
                                                 struct fib6_info *rt,
                                                 const struct in6_addr *saddr,
                                                 int oif,
                                                 int flags)
{
        struct fib6_info *sprt;

        if (!oif && ipv6_addr_any(saddr) &&
            !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
                return rt;

        for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
                const struct net_device *dev = sprt->fib6_nh.nh_dev;

                if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
                        continue;

                if (oif) {
                        if (dev->ifindex == oif)
                                return sprt;
                } else {
                        if (ipv6_chk_addr(net, saddr, dev,
                                          flags & RT6_LOOKUP_F_IFACE))
                                return sprt;
                }
        }

        if (oif && flags & RT6_LOOKUP_F_IFACE)
                return net->ipv6.fib6_null_entry;

        return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
        struct work_struct work;
        struct in6_addr target;
        struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
        struct in6_addr mcaddr;
        struct __rt6_probe_work *work =
                container_of(w, struct __rt6_probe_work, work);

        addrconf_addr_solict_mult(&work->target, &mcaddr);
        ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
        dev_put(work->dev);
        kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
        struct __rt6_probe_work *work = NULL;
        const struct in6_addr *nh_gw;
        struct neighbour *neigh;
        struct net_device *dev;
        struct inet6_dev *idev;

        /*
         * Okay, this does not seem to be appropriate
         * for now, however, we need to check if it
         * is really so; aka Router Reachability Probing.
         *
         * Router Reachability Probe MUST be rate-limited
         * to no more than one per minute.
         */
        if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
                return;

        nh_gw = &rt->fib6_nh.nh_gw;
        dev = rt->fib6_nh.nh_dev;
        rcu_read_lock_bh();
        idev = __in6_dev_get(dev);
        neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
        if (neigh) {
                if (neigh->nud_state & NUD_VALID)
                        goto out;

                write_lock(&neigh->lock);
                if (!(neigh->nud_state & NUD_VALID) &&
                    time_after(jiffies,
                               neigh->updated + idev->cnf.rtr_probe_interval)) {
                        work = kmalloc(sizeof(*work), GFP_ATOMIC);
                        if (work)
                                __neigh_set_probe_once(neigh);
                }
                write_unlock(&neigh->lock);
        } else if (time_after(jiffies, rt->last_probe +
                                       idev->cnf.rtr_probe_interval)) {
                work = kmalloc(sizeof(*work), GFP_ATOMIC);
        }

        if (work) {
                rt->last_probe = jiffies;
                INIT_WORK(&work->work, rt6_probe_deferred);
                work->target = *nh_gw;
                dev_hold(dev);
                work->dev = dev;
                schedule_work(&work->work);
        }

out:
        rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif
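
/* Illustrative sketch (not part of the kernel sources): the rate limit
 * in rt6_probe() is the usual "jiffies watermark" idiom - record when
 * the last probe fired and only fire again once a full interval has
 * elapsed.  Stripped to its core, with hypothetical names:
 *
 *	static unsigned long last_probe;
 *
 *	static bool probe_allowed(unsigned long interval)
 *	{
 *		if (!time_after(jiffies, last_probe + interval))
 *			return false;
 *		last_probe = jiffies;
 *		return true;
 *	}
 *
 * time_after() is used instead of a plain comparison so the check
 * stays correct when jiffies wraps around.
 */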

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
        const struct net_device *dev = rt->fib6_nh.nh_dev;

        if (!oif || dev->ifindex == oif)
                return 2;
        return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
        enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
        struct neighbour *neigh;

        if (rt->fib6_flags & RTF_NONEXTHOP ||
            !(rt->fib6_flags & RTF_GATEWAY))
                return RT6_NUD_SUCCEED;

        rcu_read_lock_bh();
        neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
                                          &rt->fib6_nh.nh_gw);
        if (neigh) {
                read_lock(&neigh->lock);
                if (neigh->nud_state & NUD_VALID)
                        ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
                else if (!(neigh->nud_state & NUD_FAILED))
                        ret = RT6_NUD_SUCCEED;
                else
                        ret = RT6_NUD_FAIL_PROBE;
#endif
                read_unlock(&neigh->lock);
        } else {
                ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
                      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
        }
        rcu_read_unlock_bh();

        return ret;
}

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
        int m;

        m = rt6_check_dev(rt, oif);
        if (!m && (strict & RT6_LOOKUP_F_IFACE))
                return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
        m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
        if (strict & RT6_LOOKUP_F_REACHABLE) {
                int n = rt6_check_neigh(rt);
                if (n < 0)
                        return n;
        }
        return m;
}
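
/* Illustrative sketch (not part of the kernel sources): the score packs
 * independent criteria into disjoint bit ranges of one integer - an
 * interface match contributes 2, and the decoded RFC 4191 router
 * preference (high > medium > low) lands in the bits above it:
 *
 *	m  = oif_matches ? 2 : 0;
 *	m |= decoded_pref << 2;
 *
 * so the plain "m > *mpri" comparison in find_match() orders candidates
 * by preference first, with the interface match as tie-breaker, while
 * the negative RT6_NUD_* values signal failures that no positive score
 * should ever lose to.
 */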

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
        const struct net_device *dev = fib6_info_nh_dev(f6i);
        bool rc = false;

        if (dev) {
                const struct inet6_dev *idev = __in6_dev_get(dev);

                rc = !!idev->cnf.ignore_routes_with_linkdown;
        }

        return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
                                    int *mpri, struct fib6_info *match,
                                    bool *do_rr)
{
        int m;
        bool match_do_rr = false;

        if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
                goto out;

        if (fib6_ignore_linkdown(rt) &&
            rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
            !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
                goto out;

        if (fib6_check_expired(rt))
                goto out;

        m = rt6_score_route(rt, oif, strict);
        if (m == RT6_NUD_FAIL_DO_RR) {
                match_do_rr = true;
                m = 0; /* lowest valid score */
        } else if (m == RT6_NUD_FAIL_HARD) {
                goto out;
        }

        if (strict & RT6_LOOKUP_F_REACHABLE)
                rt6_probe(rt);

        /* note that m can be RT6_NUD_FAIL_PROBE at this point */
        if (m > *mpri) {
                *do_rr = match_do_rr;
                *mpri = m;
                match = rt;
        }
out:
        return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
                                      struct fib6_info *leaf,
                                      struct fib6_info *rr_head,
                                      u32 metric, int oif, int strict,
                                      bool *do_rr)
{
        struct fib6_info *rt, *match, *cont;
        int mpri = -1;

        match = NULL;
        cont = NULL;
        for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
                if (rt->fib6_metric != metric) {
                        cont = rt;
                        break;
                }

                match = find_match(rt, oif, strict, &mpri, match, do_rr);
        }

        for (rt = leaf; rt && rt != rr_head;
             rt = rcu_dereference(rt->fib6_next)) {
                if (rt->fib6_metric != metric) {
                        cont = rt;
                        break;
                }

                match = find_match(rt, oif, strict, &mpri, match, do_rr);
        }

        if (match || !cont)
                return match;

        for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
                match = find_match(rt, oif, strict, &mpri, match, do_rr);

        return match;
}

static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
                                    int oif, int strict)
{
        struct fib6_info *leaf = rcu_dereference(fn->leaf);
        struct fib6_info *match, *rt0;
        bool do_rr = false;
        int key_plen;

        if (!leaf || leaf == net->ipv6.fib6_null_entry)
                return net->ipv6.fib6_null_entry;

        rt0 = rcu_dereference(fn->rr_ptr);
        if (!rt0)
                rt0 = leaf;

        /* Double check to make sure fn is not an intermediate node
         * and fn->leaf does not point to its child's leaf
         * (This might happen if all routes under fn are deleted from
         * the tree and fib6_repair_tree() is called on the node.)
         */
        key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
        if (rt0->fib6_src.plen)
                key_plen = rt0->fib6_src.plen;
#endif
        if (fn->fn_bit != key_plen)
                return net->ipv6.fib6_null_entry;

        match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
                             &do_rr);

        if (do_rr) {
                struct fib6_info *next = rcu_dereference(rt0->fib6_next);

                /* no entries matched; do round-robin */
                if (!next || next->fib6_metric != rt0->fib6_metric)
                        next = leaf;

                if (next != rt0) {
                        spin_lock_bh(&leaf->fib6_table->tb6_lock);
                        /* make sure next is not being deleted from the tree */
                        if (next->fib6_node)
                                rcu_assign_pointer(fn->rr_ptr, next);
                        spin_unlock_bh(&leaf->fib6_table->tb6_lock);
                }
        }

        return match ? match : net->ipv6.fib6_null_entry;
}
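
/* Illustrative sketch (not part of the kernel sources): the round-robin
 * above advances a "start here next time" cursor (fn->rr_ptr) to the
 * next entry of the same metric whenever the current head failed,
 * wrapping back to the head of the list.  The same idea over a plain
 * array, with hypothetical names:
 *
 *	static int rr_cursor;
 *
 *	static int rr_next(int nentries)
 *	{
 *		int picked = rr_cursor;
 *
 *		rr_cursor = (rr_cursor + 1) % nentries;
 *		return picked;
 *	}
 *
 * The kernel version additionally takes tb6_lock before moving the
 * cursor, because a writer might be unlinking "next" from the tree at
 * that very moment.
 */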

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
        return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr)
{
        struct net *net = dev_net(dev);
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        unsigned long lifetime;
        struct fib6_info *rt;

        if (len < sizeof(struct route_info)) {
                return -EINVAL;
        }

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 128) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 64) {
                if (rinfo->length < 2) {
                        return -EINVAL;
                }
        } else if (rinfo->prefix_len > 0) {
                if (rinfo->length < 1) {
                        return -EINVAL;
                }
        }

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        if (rinfo->length == 3)
                prefix = (struct in6_addr *)rinfo->prefix;
        else {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        if (rinfo->prefix_len == 0)
                rt = rt6_get_dflt_router(net, gwaddr, dev);
        else
                rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
                                        gwaddr, dev);

        if (rt && !lifetime) {
                ip6_del_rt(net, rt);
                rt = NULL;
        }

        if (!rt && lifetime)
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                        dev, pref);
        else if (rt)
                rt->fib6_flags = RTF_ROUTEINFO |
                                 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (rt) {
                if (!addrconf_finite_timeout(lifetime))
                        fib6_clean_expires(rt);
                else
                        fib6_set_expires(rt, jiffies + HZ * lifetime);

                fib6_info_release(rt);
        }
        return 0;
}
#endif
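
/* Illustrative sketch (not part of the kernel sources): the sanity
 * checks above follow RFC 4191, where the Route Information Option's
 * "length" field counts 8-octet units - length 1 carries no prefix
 * octets, length 2 carries 8 (up to a /64), length 3 carries the full
 * 16 (up to a /128).  A standalone condensation of the same checks,
 * with a hypothetical name:
 *
 *	static bool rinfo_len_ok(u8 length, u8 prefix_len)
 *	{
 *		if (length > 3 || prefix_len > 128)
 *			return false;
 *		if (prefix_len > 64)
 *			return length >= 2;
 *		if (prefix_len > 0)
 *			return length >= 1;
 *		return true;
 *	}
 *
 * Note that only length == 3 guarantees all 16 prefix octets are on
 * the wire, which is why the else branch above copies just prefix_len
 * bits into a zeroed buffer instead of aliasing rinfo->prefix.
 */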

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
        struct net_device *dev = rt->fib6_nh.nh_dev;

        if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
                /* for copies of local routes, dst->dev needs to be the
                 * device if it is a master device, the master device if
                 * device is enslaved, and the loopback as the default
                 */
                if (netif_is_l3_slave(dev) &&
                    !rt6_need_strict(&rt->fib6_dst.addr))
                        dev = l3mdev_master_dev_rcu(dev);
                else if (!netif_is_l3_master(dev))
                        dev = dev_net(dev)->loopback_dev;
                /* last case is netif_is_l3_master(dev) is true in which
                 * case we want dev returned to be dev
                 */
        }

        return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
        [RTN_BLACKHOLE]   = -EINVAL,
        [RTN_UNREACHABLE] = -EHOSTUNREACH,
        [RTN_PROHIBIT]    = -EACCES,
        [RTN_THROW]       = -EAGAIN,
        [RTN_XRESOLVE]    = -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
        return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
        unsigned short flags = 0;

        if (rt->dst_nocount)
                flags |= DST_NOCOUNT;
        if (rt->dst_nopolicy)
                flags |= DST_NOPOLICY;

        return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
        rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

        switch (ort->fib6_type) {
        case RTN_BLACKHOLE:
                rt->dst.output = dst_discard_out;
                rt->dst.input = dst_discard;
                break;
        case RTN_PROHIBIT:
                rt->dst.output = ip6_pkt_prohibit_out;
                rt->dst.input = ip6_pkt_prohibit;
                break;
        case RTN_THROW:
        case RTN_UNREACHABLE:
        default:
                rt->dst.output = ip6_pkt_discard_out;
                rt->dst.input = ip6_pkt_discard;
                break;
        }
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
        if (ort->fib6_flags & RTF_REJECT) {
                ip6_rt_init_dst_reject(rt, ort);
                return;
        }

        rt->dst.error = 0;
        rt->dst.output = ip6_output;

        if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
                rt->dst.input = ip6_input;
        } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
                rt->dst.input = ip6_mc_input;
        } else {
                rt->dst.input = ip6_forward;
        }

        if (ort->fib6_nh.nh_lwtstate) {
                rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
                lwtunnel_set_redirect(&rt->dst);
        }

        rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
        rt->rt6i_flags &= ~RTF_EXPIRES;
        rcu_assign_pointer(rt->from, from);
        dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
        if (from->fib6_metrics != &dst_default_metrics) {
                rt->dst._metrics |= DST_METRICS_REFCOUNTED;
                refcount_inc(&from->fib6_metrics->refcnt);
        }
}

/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
        struct net_device *dev = fib6_info_nh_dev(ort);

        ip6_rt_init_dst(rt, ort);

        rt->rt6i_dst = ort->fib6_dst;
        rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
        rt->rt6i_gateway = ort->fib6_nh.nh_gw;
        rt->rt6i_flags = ort->fib6_flags;
        rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
        rt->rt6i_src = ort->fib6_src;
#endif
        rt->rt6i_prefsrc = ort->fib6_prefsrc;
}

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
                                        struct in6_addr *saddr)
{
        struct fib6_node *pn, *sn;
        while (1) {
                if (fn->fn_flags & RTN_TL_ROOT)
                        return NULL;
                pn = rcu_dereference(fn->parent);
                sn = FIB6_SUBTREE(pn);
                if (sn && sn != fn)
                        fn = fib6_node_lookup(sn, NULL, saddr);
                else
                        fn = pn;
                if (fn->fn_flags & RTN_RTINFO)
                        return fn;
        }
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
                          bool null_fallback)
{
        struct rt6_info *rt = *prt;

        if (dst_hold_safe(&rt->dst))
                return true;
        if (null_fallback) {
                rt = net->ipv6.ip6_null_entry;
                dst_hold(&rt->dst);
        } else {
                rt = NULL;
        }
        *prt = rt;
        return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
        unsigned short flags = fib6_info_dst_flags(rt);
        struct net_device *dev = rt->fib6_nh.nh_dev;
        struct rt6_info *nrt;

        if (!fib6_info_hold_safe(rt))
                goto fallback;

        nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
        if (!nrt) {
                fib6_info_release(rt);
                goto fallback;
        }

        ip6_rt_copy_init(nrt, rt);
        return nrt;

fallback:
        nrt = dev_net(dev)->ipv6.ip6_null_entry;
        dst_hold(&nrt->dst);
        return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags)
{
        struct fib6_info *f6i;
        struct fib6_node *fn;
        struct rt6_info *rt;

        if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
                flags &= ~RT6_LOOKUP_F_IFACE;

        rcu_read_lock();
        fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        f6i = rcu_dereference(fn->leaf);
        if (!f6i) {
                f6i = net->ipv6.fib6_null_entry;
        } else {
                f6i = rt6_device_match(net, f6i, &fl6->saddr,
                                       fl6->flowi6_oif, flags);
                if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
                        f6i = fib6_multipath_select(net, f6i, fl6,
                                                    fl6->flowi6_oif, skb,
                                                    flags);
        }
        if (f6i == net->ipv6.fib6_null_entry) {
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto restart;
        }

        trace_fib6_table_lookup(net, f6i, table, fl6);

        /* Search through exception table */
        rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
        if (rt) {
                if (ip6_hold_safe(net, &rt, true))
                        dst_use_noref(&rt->dst, jiffies);
        } else if (f6i == net->ipv6.fib6_null_entry) {
                rt = net->ipv6.ip6_null_entry;
                dst_hold(&rt->dst);
        } else {
                rt = ip6_create_rt_rcu(f6i);
        }

        rcu_read_unlock();

        return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
                                   const struct sk_buff *skb, int flags)
{
        return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
                            const struct in6_addr *saddr, int oif,
                            const struct sk_buff *skb, int strict)
{
        struct flowi6 fl6 = {
                .flowi6_oif = oif,
                .daddr = *daddr,
        };
        struct dst_entry *dst;
        int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

        if (saddr) {
                memcpy(&fl6.saddr, saddr, sizeof(*saddr));
                flags |= RT6_LOOKUP_F_HAS_SADDR;
        }

        dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
        if (dst->error == 0)
                return (struct rt6_info *) dst;

        dst_release(dst);

        return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
                        struct netlink_ext_ack *extack)
{
        int err;
        struct fib6_table *table;

        table = rt->fib6_table;
        spin_lock_bh(&table->tb6_lock);
        err = fib6_add(&table->tb6_root, rt, info, extack);
        spin_unlock_bh(&table->tb6_lock);

        return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
        struct nl_info info = { .nl_net = net, };

        return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
                                           const struct in6_addr *daddr,
                                           const struct in6_addr *saddr)
{
        struct net_device *dev;
        struct rt6_info *rt;

        if (!fib6_info_hold_safe(ort))
                return NULL;

        dev = ip6_rt_get_dev_rcu(ort);
        rt = ip6_dst_alloc(dev_net(dev), dev, 0);
        if (!rt) {
                fib6_info_release(ort);
                return NULL;
        }

        ip6_rt_copy_init(rt, ort);
        rt->rt6i_flags |= RTF_CACHE;
        rt->dst.flags |= DST_HOST;
        rt->rt6i_dst.addr = *daddr;
        rt->rt6i_dst.plen = 128;

        if (!rt6_is_gw_or_nonexthop(ort)) {
                if (ort->fib6_dst.plen != 128 &&
                    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
                        rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
                if (rt->rt6i_src.plen && saddr) {
                        rt->rt6i_src.addr = *saddr;
                        rt->rt6i_src.plen = 128;
                }
#endif
        }

        return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
        unsigned short flags = fib6_info_dst_flags(rt);
        struct net_device *dev;
        struct rt6_info *pcpu_rt;

        if (!fib6_info_hold_safe(rt))
                return NULL;

        rcu_read_lock();
        dev = ip6_rt_get_dev_rcu(rt);
        pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
        rcu_read_unlock();
        if (!pcpu_rt) {
                fib6_info_release(rt);
                return NULL;
        }
        ip6_rt_copy_init(pcpu_rt, rt);
        pcpu_rt->rt6i_flags |= RTF_PCPU;
        return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
        struct rt6_info *pcpu_rt, **p;

        p = this_cpu_ptr(rt->rt6i_pcpu);
        pcpu_rt = *p;

        if (pcpu_rt)
                ip6_hold_safe(NULL, &pcpu_rt, false);

        return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
                                            struct fib6_info *rt)
{
        struct rt6_info *pcpu_rt, *prev, **p;

        pcpu_rt = ip6_rt_pcpu_alloc(rt);
        if (!pcpu_rt) {
                dst_hold(&net->ipv6.ip6_null_entry->dst);
                return net->ipv6.ip6_null_entry;
        }

        dst_hold(&pcpu_rt->dst);
        p = this_cpu_ptr(rt->rt6i_pcpu);
        prev = cmpxchg(p, NULL, pcpu_rt);
        BUG_ON(prev);

        return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
                                 struct rt6_exception *rt6_ex)
{
        struct fib6_info *from;
        struct net *net;

        if (!bucket || !rt6_ex)
                return;

        net = dev_net(rt6_ex->rt6i->dst.dev);
        net->ipv6.rt6_stats->fib_rt_cache--;

        /* purge completely the exception to allow releasing the held resources:
         * some [sk] cache may keep the dst around for unlimited time
         */
        from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
        fib6_info_release(from);
        dst_dev_put(&rt6_ex->rt6i->dst);

        hlist_del_rcu(&rt6_ex->hlist);
        dst_release(&rt6_ex->rt6i->dst);
        kfree_rcu(rt6_ex, rcu);
        WARN_ON_ONCE(!bucket->depth);
        bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
        struct rt6_exception *rt6_ex, *oldest = NULL;

        if (!bucket)
                return;

        hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
                if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
                        oldest = rt6_ex;
        }
        rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
                              const struct in6_addr *src)
{
        static u32 seed __read_mostly;
        u32 val;

        net_get_random_once(&seed, sizeof(seed));
        val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
        if (src)
                val = jhash(src, sizeof(*src), val);
#endif
        return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
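
/* Illustrative note (not part of the kernel sources): the hash chains
 * jhash() calls - the destination is hashed with a boot-time random
 * seed, and under CONFIG_IPV6_SUBTREES the source is folded in by
 * reusing the first result as the seed of the second call:
 *
 *	val = jhash(dst, sizeof(*dst), seed);
 *	if (src)
 *		val = jhash(src, sizeof(*src), val);
 *	bucket = hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
 *
 * hash_32() keeps only FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits - the
 * bucket index - and the random seed keeps remote peers from being
 * able to precompute colliding (daddr, saddr) pairs and degrade one
 * bucket into a long chain.
 */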

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
                              const struct in6_addr *daddr,
                              const struct in6_addr *saddr)
{
        struct rt6_exception *rt6_ex;
        u32 hval;

        if (!(*bucket) || !daddr)
                return NULL;

        hval = rt6_exception_hash(daddr, saddr);
        *bucket += hval;

        hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
                struct rt6_info *rt6 = rt6_ex->rt6i;
                bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
                if (matched && saddr)
                        matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
                if (matched)
                        return rt6_ex;
        }
        return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
                         const struct in6_addr *daddr,
                         const struct in6_addr *saddr)
{
        struct rt6_exception *rt6_ex;
        u32 hval;

        WARN_ON_ONCE(!rcu_read_lock_held());

        if (!(*bucket) || !daddr)
                return NULL;

        hval = rt6_exception_hash(daddr, saddr);
        *bucket += hval;

        hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
                struct rt6_info *rt6 = rt6_ex->rt6i;
                bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
                if (matched && saddr)
                        matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
                if (matched)
                        return rt6_ex;
        }
        return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
        unsigned int mtu;

        if (rt->fib6_pmtu) {
                mtu = rt->fib6_pmtu;
        } else {
                struct net_device *dev = fib6_info_nh_dev(rt);
                struct inet6_dev *idev;

                rcu_read_lock();
                idev = __in6_dev_get(dev);
                mtu = idev->cnf.mtu6;
                rcu_read_unlock();
        }

        mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

        return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
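
/* Illustrative worked example (not part of the kernel sources): the
 * effective MTU resolves in precedence order and is then clamped.
 * Assuming a route with no learned PMTU, a device whose mtu6 is 1500,
 * and a hypothetical lightweight-tunnel encap needing 8 bytes of
 * headroom:
 *
 *	mtu = 1500;			(idev->cnf.mtu6 fallback)
 *	mtu = min(mtu, IP6_MAX_MTU);	(still 1500)
 *	return mtu - 8;			(room for the encap header)
 *
 * A learned per-route PMTU (fib6_pmtu) would have taken precedence
 * over the device value in the first step.
 */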

static int rt6_insert_exception(struct rt6_info *nrt,
                                struct fib6_info *ort)
{
        struct net *net = dev_net(nrt->dst.dev);
        struct rt6_exception_bucket *bucket;
        struct in6_addr *src_key = NULL;
        struct rt6_exception *rt6_ex;
        int err = 0;

        spin_lock_bh(&rt6_exception_lock);

        if (ort->exception_bucket_flushed) {
                err = -EINVAL;
                goto out;
        }

        bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
                                        lockdep_is_held(&rt6_exception_lock));
        if (!bucket) {
                bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
                                 GFP_ATOMIC);
                if (!bucket) {
                        err = -ENOMEM;
                        goto out;
                }
                rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
        }

#ifdef CONFIG_IPV6_SUBTREES
        /* rt6i_src.plen != 0 indicates ort is in subtree
         * and exception table is indexed by a hash of
         * both rt6i_dst and rt6i_src.
         * Otherwise, the exception table is indexed by
         * a hash of only rt6i_dst.
         */
        if (ort->fib6_src.plen)
                src_key = &nrt->rt6i_src.addr;
#endif

        /* Update rt6i_prefsrc as it could be changed
         * in rt6_remove_prefsrc()
         */
        nrt->rt6i_prefsrc = ort->fib6_prefsrc;
        /* rt6_mtu_change() might lower mtu on ort.
         * Only insert this exception route if its mtu
         * is less than ort's mtu value.
         */
        if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
                err = -EINVAL;
                goto out;
        }

        rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
                                               src_key);
        if (rt6_ex)
                rt6_remove_exception(bucket, rt6_ex);

        rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
        if (!rt6_ex) {
                err = -ENOMEM;
                goto out;
        }
        rt6_ex->rt6i = nrt;
        rt6_ex->stamp = jiffies;
        hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
        bucket->depth++;
        net->ipv6.rt6_stats->fib_rt_cache++;

        if (bucket->depth > FIB6_MAX_DEPTH)
                rt6_exception_remove_oldest(bucket);

out:
        spin_unlock_bh(&rt6_exception_lock);

        /* Update fn->fn_sernum to invalidate all cached dst */
        if (!err) {
                spin_lock_bh(&ort->fib6_table->tb6_lock);
                fib6_update_sernum(net, ort);
                spin_unlock_bh(&ort->fib6_table->tb6_lock);
                fib6_force_start_gc(net);
        }

        return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct hlist_node *tmp;
        int i;

        spin_lock_bh(&rt6_exception_lock);
        /* Prevent rt6_insert_exception() from recreating the bucket list */
        rt->exception_bucket_flushed = 1;

        bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
                                           lockdep_is_held(&rt6_exception_lock));
        if (!bucket)
                goto out;

        for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
                        rt6_remove_exception(bucket, rt6_ex);
                WARN_ON_ONCE(bucket->depth);
                bucket++;
        }

out:
        spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
                                           const struct in6_addr *daddr,
                                           const struct in6_addr *saddr)
{
        const struct in6_addr *src_key = NULL;
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct rt6_info *res = NULL;

#ifdef CONFIG_IPV6_SUBTREES
        /* rt6i_src.plen != 0 indicates rt is in subtree
         * and exception table is indexed by a hash of
         * both rt6i_dst and rt6i_src.
         * However, the src addr used to create the hash
         * might not be exactly the passed in saddr which
         * is a /128 addr from the flow.
         * So we need to use f6i->fib6_src to redo lookup
         * if the passed in saddr does not find anything.
         * (See the logic in ip6_rt_cache_alloc() on how
         * rt->rt6i_src is updated.)
         */
        if (rt->fib6_src.plen)
                src_key = saddr;
find_ex:
#endif
        bucket = rcu_dereference(rt->rt6i_exception_bucket);
        rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

        if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
                res = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
        /* Use fib6_src as src_key and redo lookup */
        if (!res && src_key && src_key != &rt->fib6_src.addr) {
                src_key = &rt->fib6_src.addr;
                goto find_ex;
        }
#endif

        return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
        struct rt6_exception_bucket *bucket;
        struct in6_addr *src_key = NULL;
        struct rt6_exception *rt6_ex;
        struct fib6_info *from;
        int err;

        from = rcu_dereference(rt->from);
        if (!from ||
            !(rt->rt6i_flags & RTF_CACHE))
                return -EINVAL;

        if (!rcu_access_pointer(from->rt6i_exception_bucket))
                return -ENOENT;

        spin_lock_bh(&rt6_exception_lock);
        bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
                                           lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
        /* rt6i_src.plen != 0 indicates 'from' is in subtree
         * and exception table is indexed by a hash of
         * both rt6i_dst and rt6i_src.
         * Otherwise, the exception table is indexed by
         * a hash of only rt6i_dst.
         */
        if (from->fib6_src.plen)
                src_key = &rt->rt6i_src.addr;
#endif
        rt6_ex = __rt6_find_exception_spinlock(&bucket,
                                               &rt->rt6i_dst.addr,
                                               src_key);
        if (rt6_ex) {
                rt6_remove_exception(bucket, rt6_ex);
                err = 0;
        } else {
                err = -ENOENT;
        }

        spin_unlock_bh(&rt6_exception_lock);
        return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
        struct rt6_exception_bucket *bucket;
        struct in6_addr *src_key = NULL;
        struct rt6_exception *rt6_ex;
        struct fib6_info *from;

        rcu_read_lock();
        from = rcu_dereference(rt->from);
        if (!from || !(rt->rt6i_flags & RTF_CACHE))
                goto unlock;

        bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
        /* rt6i_src.plen != 0 indicates 'from' is in subtree
         * and exception table is indexed by a hash of
         * both rt6i_dst and rt6i_src.
         * Otherwise, the exception table is indexed by
         * a hash of only rt6i_dst.
         */
        if (from->fib6_src.plen)
                src_key = &rt->rt6i_src.addr;
#endif
        rt6_ex = __rt6_find_exception_rcu(&bucket,
                                          &rt->rt6i_dst.addr,
                                          src_key);
        if (rt6_ex)
                rt6_ex->stamp = jiffies;

unlock:
        rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        int i;

        bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
                                        lockdep_is_held(&rt6_exception_lock));

        if (bucket) {
                for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                        hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
                                rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
                        }
                        bucket++;
                }
        }
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
                                         struct rt6_info *rt, int mtu)
{
        /* If the new MTU is lower than the route PMTU, this new MTU will be the
         * lowest MTU in the path: always allow updating the route PMTU to
         * reflect PMTU decreases.
         *
         * If the new MTU is higher, and the route PMTU is equal to the local
         * MTU, this means the old MTU is the lowest in the path, so allow
         * updating it: if other nodes now have lower MTUs, PMTU discovery will
         * handle this.
         */

        if (dst_mtu(&rt->dst) >= mtu)
                return true;

        if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
                return true;

        return false;
}
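
/* Illustrative worked example (not part of the kernel sources) of the
 * rule above, assuming a link MTU (idev->cnf.mtu6) of 1500:
 *
 *	route PMTU 1400, new mtu 1280  ->  allowed  (decrease)
 *	route PMTU 1400, new mtu 1500  ->  rejected (1400 was learned
 *						     from a remote hop)
 *	route PMTU 1500, new mtu 9000  ->  allowed  (the local link was
 *						     the path bottleneck)
 *
 * i.e. increases are only trusted when the local device was provably
 * the limiting hop; otherwise a still-narrow remote hop would be
 * exceeded and packets would black-hole until rediscovery.
 */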

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
                                       struct fib6_info *rt, int mtu)
{
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        int i;

        bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
                                        lockdep_is_held(&rt6_exception_lock));

        if (!bucket)
                return;

        for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
                        struct rt6_info *entry = rt6_ex->rt6i;

                        /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
                         * route), the metrics of its rt->from have already
                         * been updated.
                         */
                        if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
                            rt6_mtu_change_route_allowed(idev, entry, mtu))
                                dst_metric_set(&entry->dst, RTAX_MTU, mtu);
                }
                bucket++;
        }
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
                                        struct in6_addr *gateway)
{
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct hlist_node *tmp;
        int i;

        if (!rcu_access_pointer(rt->rt6i_exception_bucket))
                return;

        spin_lock_bh(&rt6_exception_lock);
        bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
                                           lockdep_is_held(&rt6_exception_lock));

        if (bucket) {
                for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                        hlist_for_each_entry_safe(rt6_ex, tmp,
                                                  &bucket->chain, hlist) {
                                struct rt6_info *entry = rt6_ex->rt6i;

                                if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
                                    RTF_CACHE_GATEWAY &&
                                    ipv6_addr_equal(gateway,
                                                    &entry->rt6i_gateway)) {
                                        rt6_remove_exception(bucket, rt6_ex);
                                }
                        }
                        bucket++;
                }
        }

        spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
                                      struct rt6_exception *rt6_ex,
                                      struct fib6_gc_args *gc_args,
                                      unsigned long now)
{
        struct rt6_info *rt = rt6_ex->rt6i;

        /* we are pruning and obsoleting aged-out and non gateway exceptions
         * even if others have still references to them, so that on next
         * dst_check() such references can be dropped.
         * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
         * expired, independently from their aging, as per RFC 8201 section 4
         */
        if (!(rt->rt6i_flags & RTF_EXPIRES)) {
                if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
                        RT6_TRACE("aging clone %p\n", rt);
                        rt6_remove_exception(bucket, rt6_ex);
                        return;
                }
        } else if (time_after(jiffies, rt->dst.expires)) {
                RT6_TRACE("purging expired route %p\n", rt);
                rt6_remove_exception(bucket, rt6_ex);
                return;
        }

        if (rt->rt6i_flags & RTF_GATEWAY) {
                struct neighbour *neigh;
                __u8 neigh_flags = 0;

                neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
                if (neigh)
                        neigh_flags = neigh->flags;

                if (!(neigh_flags & NTF_ROUTER)) {
                        RT6_TRACE("purging route %p via non-router but gateway\n",
                                  rt);
                        rt6_remove_exception(bucket, rt6_ex);
                        return;
                }
        }

        gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
                        struct fib6_gc_args *gc_args,
                        unsigned long now)
{
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct hlist_node *tmp;
        int i;

        if (!rcu_access_pointer(rt->rt6i_exception_bucket))
                return;

        rcu_read_lock_bh();
        spin_lock(&rt6_exception_lock);
        bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
                                           lockdep_is_held(&rt6_exception_lock));

        if (bucket) {
                for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                        hlist_for_each_entry_safe(rt6_ex, tmp,
                                                  &bucket->chain, hlist) {
                                rt6_age_examine_exception(bucket, rt6_ex,
                                                          gc_args, now);
                        }
                        bucket++;
                }
        }
        spin_unlock(&rt6_exception_lock);
        rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
                                    int oif, struct flowi6 *fl6, int strict)
{
        struct fib6_node *fn, *saved_fn;
        struct fib6_info *f6i;

        fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
        saved_fn = fn;

        if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
                oif = 0;

redo_rt6_select:
        f6i = rt6_select(net, fn, oif, strict);
        if (f6i == net->ipv6.fib6_null_entry) {
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto redo_rt6_select;
                else if (strict & RT6_LOOKUP_F_REACHABLE) {
                        /* also consider unreachable route */
                        strict &= ~RT6_LOOKUP_F_REACHABLE;
                        fn = saved_fn;
                        goto redo_rt6_select;
                }
        }

        trace_fib6_table_lookup(net, f6i, table, fl6);

        return f6i;
}

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                               int oif, struct flowi6 *fl6,
                               const struct sk_buff *skb, int flags)
{
        struct fib6_info *f6i;
        struct rt6_info *rt;
        int strict = 0;

        strict |= flags & RT6_LOOKUP_F_IFACE;
        strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
        if (net->ipv6.devconf_all->forwarding == 0)
                strict |= RT6_LOOKUP_F_REACHABLE;

        rcu_read_lock();

        f6i = fib6_table_lookup(net, table, oif, fl6, strict);
        if (f6i->fib6_nsiblings)
                f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

        if (f6i == net->ipv6.fib6_null_entry) {
                rt = net->ipv6.ip6_null_entry;
                rcu_read_unlock();
                dst_hold(&rt->dst);
                return rt;
        }

        /* Search through exception table */
        rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
        if (rt) {
                if (ip6_hold_safe(net, &rt, true))
                        dst_use_noref(&rt->dst, jiffies);

                rcu_read_unlock();
                return rt;
        } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
                            !(f6i->fib6_flags & RTF_GATEWAY))) {
                /* Create a RTF_CACHE clone which will not be
                 * owned by the fib6 tree.  It is for the special case where
                 * the daddr in the skb during the neighbor look-up is different
                 * from the fl6->daddr used to look-up route here.
                 */
                struct rt6_info *uncached_rt;

                uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

                rcu_read_unlock();

                if (uncached_rt) {
                        /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
                         * No need for another dst_hold()
                         */
                        rt6_uncached_list_add(uncached_rt);
                        atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
                } else {
                        uncached_rt = net->ipv6.ip6_null_entry;
                        dst_hold(&uncached_rt->dst);
                }

                return uncached_rt;
        } else {
                /* Get a percpu copy */

                struct rt6_info *pcpu_rt;

                local_bh_disable();
                pcpu_rt = rt6_get_pcpu_route(f6i);

                if (!pcpu_rt)
                        pcpu_rt = rt6_make_pcpu_route(net, f6i);

                local_bh_enable();
                rcu_read_unlock();

                return pcpu_rt;
        }
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
                                            struct fib6_table *table,
                                            struct flowi6 *fl6,
                                            const struct sk_buff *skb,
                                            int flags)
{
        return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
                                         struct net_device *dev,
                                         struct flowi6 *fl6,
                                         const struct sk_buff *skb,
                                         int flags)
{
        if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
                flags |= RT6_LOOKUP_F_IFACE;

        return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
                                  struct flow_keys *keys,
                                  struct flow_keys *flkeys)
{
        const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
        const struct ipv6hdr *key_iph = outer_iph;
        struct flow_keys *_flkeys = flkeys;
        const struct ipv6hdr *inner_iph;
        const struct icmp6hdr *icmph;
        struct ipv6hdr _inner_iph;
        struct icmp6hdr _icmph;

        if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
                goto out;

        icmph = skb_header_pointer(skb, skb_transport_offset(skb),
                                   sizeof(_icmph), &_icmph);
        if (!icmph)
                goto out;

        if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
            icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
            icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
            icmph->icmp6_type != ICMPV6_PARAMPROB)
                goto out;

        inner_iph = skb_header_pointer(skb,
                                       skb_transport_offset(skb) + sizeof(*icmph),
                                       sizeof(_inner_iph), &_inner_iph);
        if (!inner_iph)
                goto out;

        key_iph = inner_iph;
        _flkeys = NULL;
out:
        if (_flkeys) {
                keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
                keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
                keys->tags.flow_label = _flkeys->tags.flow_label;
                keys->basic.ip_proto = _flkeys->basic.ip_proto;
        } else {
                keys->addrs.v6addrs.src = key_iph->saddr;
                keys->addrs.v6addrs.dst = key_iph->daddr;
                keys->tags.flow_label = ip6_flowlabel(key_iph);
                keys->basic.ip_proto = key_iph->nexthdr;
        }
}

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
                       const struct sk_buff *skb, struct flow_keys *flkeys)
{
        struct flow_keys hash_keys;
        u32 mhash;

        switch (ip6_multipath_hash_policy(net)) {
        case 0:
                memset(&hash_keys, 0, sizeof(hash_keys));
                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                if (skb) {
                        ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
                } else {
                        hash_keys.addrs.v6addrs.src = fl6->saddr;
                        hash_keys.addrs.v6addrs.dst = fl6->daddr;
                        hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
                        hash_keys.basic.ip_proto = fl6->flowi6_proto;
                }
                break;
        case 1:
                if (skb) {
                        unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
                        struct flow_keys keys;

                        /* short-circuit if we already have L4 hash present */
                        if (skb->l4_hash)
                                return skb_get_hash_raw(skb) >> 1;

                        memset(&hash_keys, 0, sizeof(hash_keys));

                        if (!flkeys) {
                                skb_flow_dissect_flow_keys(skb, &keys, flag);
                                flkeys = &keys;
                        }
                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                        hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
                        hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
                        hash_keys.ports.src = flkeys->ports.src;
                        hash_keys.ports.dst = flkeys->ports.dst;
                        hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
                } else {
                        memset(&hash_keys, 0, sizeof(hash_keys));
                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                        hash_keys.addrs.v6addrs.src = fl6->saddr;
                        hash_keys.addrs.v6addrs.dst = fl6->daddr;
                        hash_keys.ports.src = fl6->fl6_sport;
                        hash_keys.ports.dst = fl6->fl6_dport;
                        hash_keys.basic.ip_proto = fl6->flowi6_proto;
                }
                break;
        }
        mhash = flow_hash_from_keys(&hash_keys);

        return mhash >> 1;
}
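
/* Illustrative note (not part of the kernel sources): policy 0 keys the
 * hash on L3 fields only (addresses, flow label, next header), policy 1
 * adds the transport ports.  Under policy 1, for example,
 *
 *	{ saddr, daddr, tcp, sport 40000, dport 443 }
 *	{ saddr, daddr, tcp, sport 40001, dport 443 }
 *
 * hash to different values and may ride different nexthops, while under
 * policy 0 they collapse onto the same path.  The final ">> 1" keeps
 * the result within 31 bits, so it stays non-negative when compared
 * against nh_upper_bound, whose negative values mark unusable nexthops.
 */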

void ip6_route_input(struct sk_buff *skb)
{
        const struct ipv6hdr *iph = ipv6_hdr(skb);
        struct net *net = dev_net(skb->dev);
        int flags = RT6_LOOKUP_F_HAS_SADDR;
        struct ip_tunnel_info *tun_info;
        struct flowi6 fl6 = {
                .flowi6_iif = skb->dev->ifindex,
                .daddr = iph->daddr,
                .saddr = iph->saddr,
                .flowlabel = ip6_flowinfo(iph),
                .flowi6_mark = skb->mark,
                .flowi6_proto = iph->nexthdr,
        };
        struct flow_keys *flkeys = NULL, _flkeys;

        tun_info = skb_tunnel_info(skb);
        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
                fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

        if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
                flkeys = &_flkeys;

        if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
                fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);

        skb_dst_drop(skb);
        skb_dst_set(skb,
                    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags)
{
        return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
                                         struct flowi6 *fl6, int flags)
{
        bool any_src;

        if (rt6_need_strict(&fl6->daddr)) {
                struct dst_entry *dst;

                dst = l3mdev_link_scope_lookup(net, fl6);
                if (dst)
                        return dst;
        }

        fl6->flowi6_iif = LOOPBACK_IFINDEX;

        any_src = ipv6_addr_any(&fl6->saddr);
        if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
            (fl6->flowi6_oif && any_src))
                flags |= RT6_LOOKUP_F_IFACE;

        if (!any_src)
                flags |= RT6_LOOKUP_F_HAS_SADDR;
        else if (sk)
                flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

        return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
        struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
        struct net_device *loopback_dev = net->loopback_dev;
        struct dst_entry *new = NULL;

        rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
                       DST_OBSOLETE_DEAD, 0);
        if (rt) {
                rt6_info_init(rt);
                atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

                new = &rt->dst;
                new->input = dst_discard;
                new->output = dst_discard_out;

                dst_copy_metrics(new, &ort->dst);

                rt->rt6i_idev = in6_dev_get(loopback_dev);
                rt->rt6i_gateway = ort->rt6i_gateway;
                rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
        }

        dst_release(dst_orig);
        return new ? new : ERR_PTR(-ENOMEM);
}

/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
        u32 rt_cookie = 0;

        if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
                return false;

        if (fib6_check_expired(f6i))
                return false;

        return true;
}

static struct dst_entry *rt6_check(struct rt6_info *rt,
                                   struct fib6_info *from,
                                   u32 cookie)
{
        u32 rt_cookie = 0;

        if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
            rt_cookie != cookie)
                return NULL;

        if (rt6_check_expired(rt))
                return NULL;

        return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
                                            struct fib6_info *from,
                                            u32 cookie)
{
        if (!__rt6_check_expired(rt) &&
            rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
            fib6_check(from, cookie))
                return &rt->dst;
        else
                return NULL;
}

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct dst_entry *dst_ret;
        struct fib6_info *from;
        struct rt6_info *rt;

        rt = container_of(dst, struct rt6_info, dst);

        rcu_read_lock();

        /* All IPV6 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         */

        from = rcu_dereference(rt->from);

        if (from && (rt->rt6i_flags & RTF_PCPU ||
            unlikely(!list_empty(&rt->rt6i_uncached))))
                dst_ret = rt6_dst_from_check(rt, from, cookie);
        else
                dst_ret = rt6_check(rt, from, cookie);

        rcu_read_unlock();

        return dst_ret;
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
        struct rt6_info *rt = (struct rt6_info *) dst;

        if (rt) {
                if (rt->rt6i_flags & RTF_CACHE) {
                        rcu_read_lock();
                        if (rt6_check_expired(rt)) {
                                rt6_remove_exception_rt(rt);
                                dst = NULL;
                        }
                        rcu_read_unlock();
                } else {
                        dst_release(dst);
                        dst = NULL;
                }
        }
        return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
        struct rt6_info *rt;

        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

        rt = (struct rt6_info *) skb_dst(skb);
        if (rt) {
                rcu_read_lock();
                if (rt->rt6i_flags & RTF_CACHE) {
                        rt6_remove_exception_rt(rt);
                } else {
                        struct fib6_info *from;
                        struct fib6_node *fn;

                        from = rcu_dereference(rt->from);
                        if (from) {
                                fn = rcu_dereference(from->fib6_node);
                                if (fn && (rt->rt6i_flags & RTF_DEFAULT))
                                        fn->fn_sernum = -1;
                        }
                }
                rcu_read_unlock();
        }
}

static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
        if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
                struct fib6_info *from;

                rcu_read_lock();
                from = rcu_dereference(rt0->from);
                if (from)
                        rt0->dst.expires = from->expires;
                rcu_read_unlock();
        }

        dst_set_expires(&rt0->dst, timeout);
        rt0->rt6i_flags |= RTF_EXPIRES;
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
        struct net *net = dev_net(rt->dst.dev);

        dst_metric_set(&rt->dst, RTAX_MTU, mtu);
        rt->rt6i_flags |= RTF_MODIFIED;
        rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
        bool from_set;

        rcu_read_lock();
        from_set = !!rcu_dereference(rt->from);
        rcu_read_unlock();

        return !(rt->rt6i_flags & RTF_CACHE) &&
                (rt->rt6i_flags & RTF_PCPU || from_set);
}

static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
                                 const struct ipv6hdr *iph, u32 mtu)
{
        const struct in6_addr *daddr, *saddr;
        struct rt6_info *rt6 = (struct rt6_info *)dst;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (iph) {
                daddr = &iph->daddr;
                saddr = &iph->saddr;
        } else if (sk) {
                daddr = &sk->sk_v6_daddr;
                saddr = &inet6_sk(sk)->saddr;
        } else {
                daddr = NULL;
                saddr = NULL;
        }
        dst_confirm_neigh(dst, daddr);
        mtu = max_t(u32, mtu, IPV6_MIN_MTU);
        if (mtu >= dst_mtu(dst))
                return;

        if (!rt6_cache_allowed_for_pmtu(rt6)) {
                rt6_do_update_pmtu(rt6, mtu);
                /* update rt6_ex->stamp for cache */
                if (rt6->rt6i_flags & RTF_CACHE)
                        rt6_update_exception_stamp_rt(rt6);
        } else if (daddr) {
                struct fib6_info *from;
                struct rt6_info *nrt6;

                rcu_read_lock();
                from = rcu_dereference(rt6->from);
                if (!from) {
                        rcu_read_unlock();
                        return;
                }
                nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
                if (nrt6) {
                        rt6_do_update_pmtu(nrt6, mtu);
                        if (rt6_insert_exception(nrt6, from))
                                dst_release_immediate(&nrt6->dst);
                }
                rcu_read_unlock();
        }
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                               struct sk_buff *skb, u32 mtu)
{
        __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}

void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
                     int oif, u32 mark, kuid_t uid)
{
        const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
        struct dst_entry *dst;
        struct flowi6 fl6;

        memset(&fl6, 0, sizeof(fl6));
        fl6.flowi6_oif = oif;
        fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
        fl6.daddr = iph->daddr;
        fl6.saddr = iph->saddr;
        fl6.flowlabel = ip6_flowinfo(iph);
        fl6.flowi6_uid = uid;

        dst = ip6_route_output(net, NULL, &fl6);
        if (!dst->error)
                __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
        dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
        int oif = sk->sk_bound_dev_if;
        struct dst_entry *dst;

        if (!oif && skb->dev)
                oif = l3mdev_master_ifindex(skb->dev);

        ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

        dst = __sk_dst_get(sk);
        if (!dst || !dst->obsolete ||
            dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
                return;

        bh_lock_sock(sk);
        if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
                ip6_datagram_dst_update(sk, false);
        bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
                           const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
        struct ipv6_pinfo *np = inet6_sk(sk);
#endif

        ip6_dst_store(sk, dst,
                      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
                      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
                      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
                      &np->saddr :
#endif
                      NULL);
}
2459 /* Handle redirects */
2460 struct ip6rd_flowi {
2462 struct in6_addr gateway;
2465 static struct rt6_info *__ip6_route_redirect(struct net *net,
2466 struct fib6_table *table,
2468 const struct sk_buff *skb,
2471 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2472 struct rt6_info *ret = NULL, *rt_cache;
2473 struct fib6_info *rt;
2474 struct fib6_node *fn;
2476 /* Get the "current" route for this destination and
2477 * check if the redirect has come from appropriate router.
2479 * RFC 4861 specifies that redirects should only be
2480 * accepted if they come from the nexthop to the target.
2481 * Due to the way the routes are chosen, this notion
2482 * is a bit fuzzy and one might need to check all possible
2487 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2489 for_each_fib6_node_rt_rcu(fn) {
2490 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2492 if (fib6_check_expired(rt))
2494 if (rt->fib6_flags & RTF_REJECT)
2496 if (!(rt->fib6_flags & RTF_GATEWAY))
2498 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2500 /* rt_cache's gateway might be different from its 'parent'
2501 * in the case of an ip redirect.
2502 * So we keep searching in the exception table if the gateway
2505 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2506 rt_cache = rt6_find_cached_rt(rt,
2510 ipv6_addr_equal(&rdfl->gateway,
2511 &rt_cache->rt6i_gateway)) {
2521 rt = net->ipv6.fib6_null_entry;
2522 else if (rt->fib6_flags & RTF_REJECT) {
2523 ret = net->ipv6.ip6_null_entry;
2527 if (rt == net->ipv6.fib6_null_entry) {
2528 fn = fib6_backtrack(fn, &fl6->saddr);
2535 ip6_hold_safe(net, &ret, true);
2537 ret = ip6_create_rt_rcu(rt);
2541 trace_fib6_table_lookup(net, rt, table, fl6);
2545 static struct dst_entry *ip6_route_redirect(struct net *net,
2546 const struct flowi6 *fl6,
2547 const struct sk_buff *skb,
2548 const struct in6_addr *gateway)
2550 int flags = RT6_LOOKUP_F_HAS_SADDR;
2551 struct ip6rd_flowi rdfl;
2554 rdfl.gateway = *gateway;
2556 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2557 flags, __ip6_route_redirect);
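/* A minimal mental model of the wrapper above (a sketch, not extra
 * behaviour): the target gateway rides along with the ordinary flow
 * key, and the lookup callback casts it back out:
 *
 *	struct ip6rd_flowi rdfl = { .fl6 = *fl6, .gateway = *gateway };
 *	fib6_rule_lookup(net, &rdfl.fl6, skb, flags, __ip6_route_redirect);
 *
 * __ip6_route_redirect() then compares rdfl->gateway against each
 * candidate nexthop (and its exception entries) before accepting it.
 */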
2560 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2563 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2564 struct dst_entry *dst;
2567 memset(&fl6, 0, sizeof(fl6));
2568 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2569 fl6.flowi6_oif = oif;
2570 fl6.flowi6_mark = mark;
2571 fl6.daddr = iph->daddr;
2572 fl6.saddr = iph->saddr;
2573 fl6.flowlabel = ip6_flowinfo(iph);
2574 fl6.flowi6_uid = uid;
2576 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2577 rt6_do_redirect(dst, NULL, skb);
2580 EXPORT_SYMBOL_GPL(ip6_redirect);
2582 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2585 const struct ipv6hdr *iph = ipv6_hdr(skb);
2586 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2587 struct dst_entry *dst;
2590 memset(&fl6, 0, sizeof(fl6));
2591 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2592 fl6.flowi6_oif = oif;
2593 fl6.flowi6_mark = mark;
2594 fl6.daddr = msg->dest;
2595 fl6.saddr = iph->daddr;
2596 fl6.flowi6_uid = sock_net_uid(net, NULL);
2598 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2599 rt6_do_redirect(dst, NULL, skb);
2603 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2605 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2608 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2610 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2612 struct net_device *dev = dst->dev;
2613 unsigned int mtu = dst_mtu(dst);
2614 struct net *net = dev_net(dev);
2616 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2618 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2619 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2622 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2623 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2624 * IPV6_MAXPLEN is also valid and means: "any MSS,
2625 * rely only on pmtu discovery"
2627 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
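/* Worked numbers (informal): a 1500 byte link MTU advertises an MSS of
 * 1500 - 40 (ipv6hdr) - 20 (tcphdr) = 1440. The clamp above only fires
 * on jumbo-capable links: any value above 65535 - 20 = 65515 is
 * reported as IPV6_MAXPLEN, i.e. "no fixed MSS, rely on PMTU discovery".
 */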
2632 static unsigned int ip6_mtu(const struct dst_entry *dst)
2634 struct inet6_dev *idev;
2637 mtu = dst_metric_raw(dst, RTAX_MTU);
2644 idev = __in6_dev_get(dst->dev);
2646 mtu = idev->cnf.mtu6;
2650 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2652 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2656 * 1. mtu on route is locked - use it
2657 * 2. mtu from nexthop exception
2658 * 3. mtu from egress device
2660 * based on ip6_dst_mtu_forward and exception logic of
2661 * rt6_find_cached_rt; called with rcu_read_lock
2663 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2664 struct in6_addr *saddr)
2666 struct inet6_dev *idev;
2667 struct rt6_info *rt;
2670 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2671 mtu = f6i->fib6_pmtu;
2676 rt = rt6_find_cached_rt(f6i, daddr, saddr);
2678 mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
2680 struct net_device *dev = fib6_info_nh_dev(f6i);
2683 idev = __in6_dev_get(dev);
2684 if (idev && idev->cnf.mtu6 > mtu)
2685 mtu = idev->cnf.mtu6;
2688 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2690 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
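/* Usage sketch (assumed caller context, not taken from this file): a
 * forwarding path that only holds the FIB entry can obtain the
 * effective path MTU without instantiating a dst:
 *
 *	rcu_read_lock();
 *	mtu = ip6_mtu_from_fib6(f6i, &ipv6_hdr(skb)->daddr,
 *				&ipv6_hdr(skb)->saddr);
 *	rcu_read_unlock();
 *
 * honouring the order documented above: locked route MTU first, then a
 * cached exception's MTU, then the egress device MTU.
 */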
2693 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2696 struct dst_entry *dst;
2697 struct rt6_info *rt;
2698 struct inet6_dev *idev = in6_dev_get(dev);
2699 struct net *net = dev_net(dev);
2701 if (unlikely(!idev))
2702 return ERR_PTR(-ENODEV);
2704 rt = ip6_dst_alloc(net, dev, 0);
2705 if (unlikely(!rt)) {
2707 dst = ERR_PTR(-ENOMEM);
2711 rt->dst.flags |= DST_HOST;
2712 rt->dst.input = ip6_input;
2713 rt->dst.output = ip6_output;
2714 rt->rt6i_gateway = fl6->daddr;
2715 rt->rt6i_dst.addr = fl6->daddr;
2716 rt->rt6i_dst.plen = 128;
2717 rt->rt6i_idev = idev;
2718 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2720 /* Add this dst into uncached_list so that rt6_disable_ip() can
2721 * do proper release of the net_device
2723 rt6_uncached_list_add(rt);
2724 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2726 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2732 static int ip6_dst_gc(struct dst_ops *ops)
2734 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2735 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2736 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2737 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2738 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2739 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2742 entries = dst_entries_get_fast(ops);
2743 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2744 entries <= rt_max_size)
2747 net->ipv6.ip6_rt_gc_expire++;
2748 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2749 entries = dst_entries_get_slow(ops);
2750 if (entries < ops->gc_thresh)
2751 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2753 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2754 return entries > rt_max_size;
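/* GC pacing, informally (a sketch of the arithmetic above, not extra
 * behaviour): every run increments ip6_rt_gc_expire and then decays it
 * by a factor of (1 - 1/2^gc_elasticity). Under sustained pressure the
 * decay dominates, the permitted entry age shrinks geometrically and
 * aging gets more aggressive; once a run brings the table under
 * gc_thresh the window snaps back to a lenient gc_timeout/2.
 */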
2757 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2758 struct fib6_config *cfg)
2760 struct dst_metrics *p;
2765 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2769 refcount_set(&p->refcnt, 1);
2770 rt->fib6_metrics = p;
2772 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2775 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2776 struct fib6_config *cfg,
2777 const struct in6_addr *gw_addr,
2778 u32 tbid, int flags)
2780 struct flowi6 fl6 = {
2781 .flowi6_oif = cfg->fc_ifindex,
2783 .saddr = cfg->fc_prefsrc,
2785 struct fib6_table *table;
2786 struct rt6_info *rt;
2788 table = fib6_get_table(net, tbid);
2792 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2793 flags |= RT6_LOOKUP_F_HAS_SADDR;
2795 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2796 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2798 /* if table lookup failed, fall back to full lookup */
2799 if (rt == net->ipv6.ip6_null_entry) {
2807 static int ip6_route_check_nh_onlink(struct net *net,
2808 struct fib6_config *cfg,
2809 const struct net_device *dev,
2810 struct netlink_ext_ack *extack)
2812 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2813 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2814 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2815 struct fib6_info *from;
2816 struct rt6_info *grt;
2820 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2823 from = rcu_dereference(grt->from);
2824 if (!grt->dst.error &&
2825 /* ignore match if it is the default route */
2826 from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2827 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2828 NL_SET_ERR_MSG(extack,
2829 "Nexthop has invalid gateway or device mismatch");
2840 static int ip6_route_check_nh(struct net *net,
2841 struct fib6_config *cfg,
2842 struct net_device **_dev,
2843 struct inet6_dev **idev)
2845 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2846 struct net_device *dev = _dev ? *_dev : NULL;
2847 struct rt6_info *grt = NULL;
2848 int err = -EHOSTUNREACH;
2850 if (cfg->fc_table) {
2851 int flags = RT6_LOOKUP_F_IFACE;
2853 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2854 cfg->fc_table, flags);
2856 if (grt->rt6i_flags & RTF_GATEWAY ||
2857 (dev && dev != grt->dst.dev)) {
2865 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2871 if (dev != grt->dst.dev) {
2876 *_dev = dev = grt->dst.dev;
2877 *idev = grt->rt6i_idev;
2879 in6_dev_hold(grt->rt6i_idev);
2882 if (!(grt->rt6i_flags & RTF_GATEWAY))
2891 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2892 struct net_device **_dev, struct inet6_dev **idev,
2893 struct netlink_ext_ack *extack)
2895 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2896 int gwa_type = ipv6_addr_type(gw_addr);
2897 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2898 const struct net_device *dev = *_dev;
2899 bool need_addr_check = !dev;
2902 /* if gw_addr is local we will fail to detect this in case the
2903 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2904 * will return the already-added prefix route via the interface
2905 * the prefix route was assigned to, which might be non-loopback.
2908 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2909 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2913 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2914 /* IPv6 strictly inhibits using non-link-local
2915 * addresses as nexthop addresses.
2916 * Otherwise, a router will not be able to send redirects.
2917 * It is very good, but in some (rare!) circumstances
2918 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2919 * some exceptions. --ANK
2920 * We allow IPv4-mapped nexthops to support RFC4798-type addressing.
2923 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2924 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2928 if (cfg->fc_flags & RTNH_F_ONLINK)
2929 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2931 err = ip6_route_check_nh(net, cfg, _dev, idev);
2937 /* reload in case device was changed */
2942 NL_SET_ERR_MSG(extack, "Egress device not specified");
2944 } else if (dev->flags & IFF_LOOPBACK) {
2945 NL_SET_ERR_MSG(extack,
2946 "Egress device can not be loopback device for this route");
2950 /* if we did not check gw_addr above, do so now that the
2951 * egress device has been resolved.
2953 if (need_addr_check &&
2954 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2955 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
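/* Illustrative userspace trigger (assumed iproute2 syntax): the onlink
 * branch above corresponds to
 *
 *	ip -6 route add 2001:db8:1::/64 via 2001:db8:2::1 dev eth0 onlink
 *
 * where the gateway need not be covered by a connected prefix, while
 * the plain form without "onlink" goes through ip6_route_check_nh()
 * and must resolve the gateway via an existing route.
 */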
2964 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2966 struct netlink_ext_ack *extack)
2968 struct net *net = cfg->fc_nlinfo.nl_net;
2969 struct fib6_info *rt = NULL;
2970 struct net_device *dev = NULL;
2971 struct inet6_dev *idev = NULL;
2972 struct fib6_table *table;
2976 /* RTF_PCPU is an internal flag; can not be set by userspace */
2977 if (cfg->fc_flags & RTF_PCPU) {
2978 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2982 /* RTF_CACHE is an internal flag; can not be set by userspace */
2983 if (cfg->fc_flags & RTF_CACHE) {
2984 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2988 if (cfg->fc_type > RTN_MAX) {
2989 NL_SET_ERR_MSG(extack, "Invalid route type");
2993 if (cfg->fc_dst_len > 128) {
2994 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2997 if (cfg->fc_src_len > 128) {
2998 NL_SET_ERR_MSG(extack, "Invalid source address length");
3001 #ifndef CONFIG_IPV6_SUBTREES
3002 if (cfg->fc_src_len) {
3003 NL_SET_ERR_MSG(extack,
3004 "Specifying source address requires IPV6_SUBTREES to be enabled");
3008 if (cfg->fc_ifindex) {
3010 dev = dev_get_by_index(net, cfg->fc_ifindex);
3013 idev = in6_dev_get(dev);
3018 if (cfg->fc_metric == 0)
3019 cfg->fc_metric = IP6_RT_PRIO_USER;
3021 if (cfg->fc_flags & RTNH_F_ONLINK) {
3023 NL_SET_ERR_MSG(extack,
3024 "Nexthop device required for onlink");
3029 if (!(dev->flags & IFF_UP)) {
3030 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3037 if (cfg->fc_nlinfo.nlh &&
3038 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3039 table = fib6_get_table(net, cfg->fc_table);
3041 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3042 table = fib6_new_table(net, cfg->fc_table);
3045 table = fib6_new_table(net, cfg->fc_table);
3052 rt = fib6_info_alloc(gfp_flags);
3056 if (cfg->fc_flags & RTF_ADDRCONF)
3057 rt->dst_nocount = true;
3059 err = ip6_convert_metrics(net, rt, cfg);
3063 if (cfg->fc_flags & RTF_EXPIRES)
3064 fib6_set_expires(rt, jiffies +
3065 clock_t_to_jiffies(cfg->fc_expires));
3067 fib6_clean_expires(rt);
3069 if (cfg->fc_protocol == RTPROT_UNSPEC)
3070 cfg->fc_protocol = RTPROT_BOOT;
3071 rt->fib6_protocol = cfg->fc_protocol;
3073 addr_type = ipv6_addr_type(&cfg->fc_dst);
3075 if (cfg->fc_encap) {
3076 struct lwtunnel_state *lwtstate;
3078 err = lwtunnel_build_state(cfg->fc_encap_type,
3079 cfg->fc_encap, AF_INET6, cfg,
3083 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3086 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3087 rt->fib6_dst.plen = cfg->fc_dst_len;
3088 if (rt->fib6_dst.plen == 128)
3089 rt->dst_host = true;
3091 #ifdef CONFIG_IPV6_SUBTREES
3092 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3093 rt->fib6_src.plen = cfg->fc_src_len;
3096 rt->fib6_metric = cfg->fc_metric;
3097 rt->fib6_nh.nh_weight = 1;
3099 rt->fib6_type = cfg->fc_type;
3101 /* We cannot add true routes via loopback here;
3102 they would result in kernel looping. Promote them to reject routes.
3104 if ((cfg->fc_flags & RTF_REJECT) ||
3105 (dev && (dev->flags & IFF_LOOPBACK) &&
3106 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3107 !(cfg->fc_flags & RTF_LOCAL))) {
3108 /* hold loopback dev/idev if we haven't done so. */
3109 if (dev != net->loopback_dev) {
3114 dev = net->loopback_dev;
3116 idev = in6_dev_get(dev);
3122 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3126 if (cfg->fc_flags & RTF_GATEWAY) {
3127 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3131 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3138 if (idev->cnf.disable_ipv6) {
3139 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3144 if (!(dev->flags & IFF_UP)) {
3145 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3150 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3151 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3152 NL_SET_ERR_MSG(extack, "Invalid source address");
3156 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3157 rt->fib6_prefsrc.plen = 128;
3159 rt->fib6_prefsrc.plen = 0;
3161 rt->fib6_flags = cfg->fc_flags;
3164 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3165 !netif_carrier_ok(dev))
3166 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3167 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3168 rt->fib6_nh.nh_dev = dev;
3169 rt->fib6_table = table;
3171 cfg->fc_nlinfo.nl_net = dev_net(dev);
3183 fib6_info_release(rt);
3184 return ERR_PTR(err);
3187 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3188 struct netlink_ext_ack *extack)
3190 struct fib6_info *rt;
3193 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3197 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3198 fib6_info_release(rt);
3203 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3205 struct net *net = info->nl_net;
3206 struct fib6_table *table;
3209 if (rt == net->ipv6.fib6_null_entry) {
3214 table = rt->fib6_table;
3215 spin_lock_bh(&table->tb6_lock);
3216 err = fib6_del(rt, info);
3217 spin_unlock_bh(&table->tb6_lock);
3220 fib6_info_release(rt);
3224 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3226 struct nl_info info = { .nl_net = net };
3228 return __ip6_del_rt(rt, &info);
3231 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3233 struct nl_info *info = &cfg->fc_nlinfo;
3234 struct net *net = info->nl_net;
3235 struct sk_buff *skb = NULL;
3236 struct fib6_table *table;
3239 if (rt == net->ipv6.fib6_null_entry)
3241 table = rt->fib6_table;
3242 spin_lock_bh(&table->tb6_lock);
3244 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3245 struct fib6_info *sibling, *next_sibling;
3247 /* prefer to send a single notification with all hops */
3248 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3250 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3252 if (rt6_fill_node(net, skb, rt, NULL,
3253 NULL, NULL, 0, RTM_DELROUTE,
3254 info->portid, seq, 0) < 0) {
3258 info->skip_notify = 1;
3261 list_for_each_entry_safe(sibling, next_sibling,
3264 err = fib6_del(sibling, info);
3270 err = fib6_del(rt, info);
3272 spin_unlock_bh(&table->tb6_lock);
3274 fib6_info_release(rt);
3277 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3278 info->nlh, gfp_any());
3283 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3287 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3290 if (cfg->fc_flags & RTF_GATEWAY &&
3291 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3294 rc = rt6_remove_exception_rt(rt);
3299 static int ip6_route_del(struct fib6_config *cfg,
3300 struct netlink_ext_ack *extack)
3302 struct rt6_info *rt_cache;
3303 struct fib6_table *table;
3304 struct fib6_info *rt;
3305 struct fib6_node *fn;
3308 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3310 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3316 fn = fib6_locate(&table->tb6_root,
3317 &cfg->fc_dst, cfg->fc_dst_len,
3318 &cfg->fc_src, cfg->fc_src_len,
3319 !(cfg->fc_flags & RTF_CACHE));
3322 for_each_fib6_node_rt_rcu(fn) {
3323 if (cfg->fc_flags & RTF_CACHE) {
3326 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3329 rc = ip6_del_cached_rt(rt_cache, cfg);
3337 if (cfg->fc_ifindex &&
3338 (!rt->fib6_nh.nh_dev ||
3339 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3341 if (cfg->fc_flags & RTF_GATEWAY &&
3342 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3344 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3346 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3348 if (!fib6_info_hold_safe(rt))
3352 /* if a gateway was specified, only delete the one hop */
3353 if (cfg->fc_flags & RTF_GATEWAY)
3354 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3356 return __ip6_del_rt_siblings(rt, cfg);
3364 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3366 struct netevent_redirect netevent;
3367 struct rt6_info *rt, *nrt = NULL;
3368 struct ndisc_options ndopts;
3369 struct inet6_dev *in6_dev;
3370 struct neighbour *neigh;
3371 struct fib6_info *from;
3373 int optlen, on_link;
3376 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3377 optlen -= sizeof(*msg);
3380 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3384 msg = (struct rd_msg *)icmp6_hdr(skb);
3386 if (ipv6_addr_is_multicast(&msg->dest)) {
3387 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3392 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3394 } else if (ipv6_addr_type(&msg->target) !=
3395 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3396 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3400 in6_dev = __in6_dev_get(skb->dev);
3403 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3407 * The IP source address of the Redirect MUST be the same as the current
3408 * first-hop router for the specified ICMP Destination Address.
3411 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3412 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3417 if (ndopts.nd_opts_tgt_lladdr) {
3418 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3421 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3426 rt = (struct rt6_info *) dst;
3427 if (rt->rt6i_flags & RTF_REJECT) {
3428 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3432 /* Redirect received -> path was valid.
3433 * Look, redirects are sent only in response to data packets,
3434 * so that this nexthop apparently is reachable. --ANK
3436 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3438 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3443 * We have finally decided to accept it.
3446 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3447 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3448 NEIGH_UPDATE_F_OVERRIDE|
3449 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3450 NEIGH_UPDATE_F_ISROUTER)),
3451 NDISC_REDIRECT, &ndopts);
3454 from = rcu_dereference(rt->from);
3458 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3462 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3464 nrt->rt6i_flags &= ~RTF_GATEWAY;
3466 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3468 /* rt6_insert_exception() will take care of duplicated exceptions */
3469 if (rt6_insert_exception(nrt, from)) {
3470 dst_release_immediate(&nrt->dst);
3474 netevent.old = &rt->dst;
3475 netevent.new = &nrt->dst;
3476 netevent.daddr = &msg->dest;
3477 netevent.neigh = neigh;
3478 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3482 neigh_release(neigh);
3485 #ifdef CONFIG_IPV6_ROUTE_INFO
3486 static struct fib6_info *rt6_get_route_info(struct net *net,
3487 const struct in6_addr *prefix, int prefixlen,
3488 const struct in6_addr *gwaddr,
3489 struct net_device *dev)
3491 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3492 int ifindex = dev->ifindex;
3493 struct fib6_node *fn;
3494 struct fib6_info *rt = NULL;
3495 struct fib6_table *table;
3497 table = fib6_get_table(net, tb_id);
3502 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3506 for_each_fib6_node_rt_rcu(fn) {
3507 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3509 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3511 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3513 if (!fib6_info_hold_safe(rt))
3522 static struct fib6_info *rt6_add_route_info(struct net *net,
3523 const struct in6_addr *prefix, int prefixlen,
3524 const struct in6_addr *gwaddr,
3525 struct net_device *dev,
3528 struct fib6_config cfg = {
3529 .fc_metric = IP6_RT_PRIO_USER,
3530 .fc_ifindex = dev->ifindex,
3531 .fc_dst_len = prefixlen,
3532 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3533 RTF_UP | RTF_PREF(pref),
3534 .fc_protocol = RTPROT_RA,
3535 .fc_type = RTN_UNICAST,
3536 .fc_nlinfo.portid = 0,
3537 .fc_nlinfo.nlh = NULL,
3538 .fc_nlinfo.nl_net = net,
3541 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3542 cfg.fc_dst = *prefix;
3543 cfg.fc_gateway = *gwaddr;
3545 /* We should treat it as a default route if prefix length is 0. */
3547 cfg.fc_flags |= RTF_DEFAULT;
3549 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3551 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
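/* Context sketch (assumption: this is driven from Router Advertisement
 * processing): an RFC 4191 Route Information option for, say,
 * 2001:db8::/48 ends up here and is installed as an
 * RTF_ROUTEINFO | RTF_GATEWAY route via the advertising router; a zero
 * prefix length is promoted to RTF_DEFAULT above.
 */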
3555 struct fib6_info *rt6_get_dflt_router(struct net *net,
3556 const struct in6_addr *addr,
3557 struct net_device *dev)
3559 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3560 struct fib6_info *rt;
3561 struct fib6_table *table;
3563 table = fib6_get_table(net, tb_id);
3568 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3569 if (dev == rt->fib6_nh.nh_dev &&
3570 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3571 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3574 if (rt && !fib6_info_hold_safe(rt))
3580 struct fib6_info *rt6_add_dflt_router(struct net *net,
3581 const struct in6_addr *gwaddr,
3582 struct net_device *dev,
3585 struct fib6_config cfg = {
3586 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3587 .fc_metric = IP6_RT_PRIO_USER,
3588 .fc_ifindex = dev->ifindex,
3589 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3590 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3591 .fc_protocol = RTPROT_RA,
3592 .fc_type = RTN_UNICAST,
3593 .fc_nlinfo.portid = 0,
3594 .fc_nlinfo.nlh = NULL,
3595 .fc_nlinfo.nl_net = net,
3598 cfg.fc_gateway = *gwaddr;
3600 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3601 struct fib6_table *table;
3603 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3605 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3608 return rt6_get_dflt_router(net, gwaddr, dev);
3611 static void __rt6_purge_dflt_routers(struct net *net,
3612 struct fib6_table *table)
3614 struct fib6_info *rt;
3618 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3619 struct net_device *dev = fib6_info_nh_dev(rt);
3620 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3622 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3623 (!idev || idev->cnf.accept_ra != 2) &&
3624 fib6_info_hold_safe(rt)) {
3626 ip6_del_rt(net, rt);
3632 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3635 void rt6_purge_dflt_routers(struct net *net)
3637 struct fib6_table *table;
3638 struct hlist_head *head;
3643 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3644 head = &net->ipv6.fib_table_hash[h];
3645 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3646 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3647 __rt6_purge_dflt_routers(net, table);
3654 static void rtmsg_to_fib6_config(struct net *net,
3655 struct in6_rtmsg *rtmsg,
3656 struct fib6_config *cfg)
3658 memset(cfg, 0, sizeof(*cfg));
3660 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3662 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3663 cfg->fc_metric = rtmsg->rtmsg_metric;
3664 cfg->fc_expires = rtmsg->rtmsg_info;
3665 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3666 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3667 cfg->fc_flags = rtmsg->rtmsg_flags;
3668 cfg->fc_type = rtmsg->rtmsg_type;
3670 cfg->fc_nlinfo.nl_net = net;
3672 cfg->fc_dst = rtmsg->rtmsg_dst;
3673 cfg->fc_src = rtmsg->rtmsg_src;
3674 cfg->fc_gateway = rtmsg->rtmsg_gateway;
3677 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3679 struct fib6_config cfg;
3680 struct in6_rtmsg rtmsg;
3684 case SIOCADDRT: /* Add a route */
3685 case SIOCDELRT: /* Delete a route */
3686 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3688 err = copy_from_user(&rtmsg, arg,
3689 sizeof(struct in6_rtmsg));
3693 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3698 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3701 err = ip6_route_del(&cfg, NULL);
3715 * Drop the packet on the floor
3718 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3721 struct dst_entry *dst = skb_dst(skb);
3722 switch (ipstats_mib_noroutes) {
3723 case IPSTATS_MIB_INNOROUTES:
3724 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3725 if (type == IPV6_ADDR_ANY) {
3726 IP6_INC_STATS(dev_net(dst->dev),
3727 __in6_dev_get_safely(skb->dev),
3728 IPSTATS_MIB_INADDRERRORS);
3732 case IPSTATS_MIB_OUTNOROUTES:
3733 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3734 ipstats_mib_noroutes);
3737 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3742 static int ip6_pkt_discard(struct sk_buff *skb)
3744 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3747 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3749 skb->dev = skb_dst(skb)->dev;
3750 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3753 static int ip6_pkt_prohibit(struct sk_buff *skb)
3755 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3758 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3760 skb->dev = skb_dst(skb)->dev;
3761 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3765 * Allocate a dst for local (unicast / anycast) address.
3768 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3769 struct inet6_dev *idev,
3770 const struct in6_addr *addr,
3771 bool anycast, gfp_t gfp_flags)
3774 struct net_device *dev = idev->dev;
3775 struct fib6_info *f6i;
3777 f6i = fib6_info_alloc(gfp_flags);
3779 return ERR_PTR(-ENOMEM);
3781 f6i->dst_nocount = true;
3782 f6i->dst_host = true;
3783 f6i->fib6_protocol = RTPROT_KERNEL;
3784 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3786 f6i->fib6_type = RTN_ANYCAST;
3787 f6i->fib6_flags |= RTF_ANYCAST;
3789 f6i->fib6_type = RTN_LOCAL;
3790 f6i->fib6_flags |= RTF_LOCAL;
3793 f6i->fib6_nh.nh_gw = *addr;
3795 f6i->fib6_nh.nh_dev = dev;
3796 f6i->fib6_dst.addr = *addr;
3797 f6i->fib6_dst.plen = 128;
3798 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3799 f6i->fib6_table = fib6_get_table(net, tb_id);
3804 /* remove a deleted IP from prefsrc entries */
3805 struct arg_dev_net_ip {
3806 struct net_device *dev;
3808 struct in6_addr *addr;
3811 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3813 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3814 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3815 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3817 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3818 rt != net->ipv6.fib6_null_entry &&
3819 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3820 spin_lock_bh(&rt6_exception_lock);
3821 /* remove prefsrc entry */
3822 rt->fib6_prefsrc.plen = 0;
3823 /* need to update cache as well */
3824 rt6_exceptions_remove_prefsrc(rt);
3825 spin_unlock_bh(&rt6_exception_lock);
3830 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3832 struct net *net = dev_net(ifp->idev->dev);
3833 struct arg_dev_net_ip adni = {
3834 .dev = ifp->idev->dev,
3838 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3841 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3843 /* Remove routers and update dst entries when a gateway turns into a host. */
3844 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3846 struct in6_addr *gateway = (struct in6_addr *)arg;
3848 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3849 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3853 /* Further clean up cached routes in the exception table.
3854 * This is needed because a cached route may have a different
3855 * gateway than its 'parent' in the case of an ip redirect.
3857 rt6_exceptions_clean_tohost(rt, gateway);
3862 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3864 fib6_clean_all(net, fib6_clean_tohost, gateway);
3867 struct arg_netdev_event {
3868 const struct net_device *dev;
3870 unsigned int nh_flags;
3871 unsigned long event;
3875 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3877 struct fib6_info *iter;
3878 struct fib6_node *fn;
3880 fn = rcu_dereference_protected(rt->fib6_node,
3881 lockdep_is_held(&rt->fib6_table->tb6_lock));
3882 iter = rcu_dereference_protected(fn->leaf,
3883 lockdep_is_held(&rt->fib6_table->tb6_lock));
3885 if (iter->fib6_metric == rt->fib6_metric &&
3886 rt6_qualify_for_ecmp(iter))
3888 iter = rcu_dereference_protected(iter->fib6_next,
3889 lockdep_is_held(&rt->fib6_table->tb6_lock));
3895 static bool rt6_is_dead(const struct fib6_info *rt)
3897 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3898 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3899 fib6_ignore_linkdown(rt)))
3905 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3907 struct fib6_info *iter;
3910 if (!rt6_is_dead(rt))
3911 total += rt->fib6_nh.nh_weight;
3913 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3914 if (!rt6_is_dead(iter))
3915 total += iter->fib6_nh.nh_weight;
3921 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3923 int upper_bound = -1;
3925 if (!rt6_is_dead(rt)) {
3926 *weight += rt->fib6_nh.nh_weight;
3927 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3930 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3933 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3935 struct fib6_info *iter;
3938 rt6_upper_bound_set(rt, &weight, total);
3940 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3941 rt6_upper_bound_set(iter, &weight, total);
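/* Worked example (a sketch, assuming the usual hash-threshold scheme):
 * two live siblings with weights 1 and 3 give total = 4, so their
 * cumulative 31-bit upper bounds land at roughly 2^31 * 1/4 and
 * 2^31 * 4/4. A flow hash below the first bound selects sibling one
 * (~25% of flows); everything else falls through to sibling two. Dead
 * siblings keep an upper bound of -1 and are never selected.
 */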
3944 void rt6_multipath_rebalance(struct fib6_info *rt)
3946 struct fib6_info *first;
3949 /* In case the entire multipath route was marked for flushing,
3950 * then there is no need to rebalance upon the removal of every sibling route.
3953 if (!rt->fib6_nsiblings || rt->should_flush)
3956 /* During lookup routes are evaluated in order, so we need to
3957 * make sure upper bounds are assigned from the first sibling onwards.
3960 first = rt6_multipath_first_sibling(rt);
3961 if (WARN_ON_ONCE(!first))
3964 total = rt6_multipath_total_weight(first);
3965 rt6_multipath_upper_bound_set(first, total);
3968 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3970 const struct arg_netdev_event *arg = p_arg;
3971 struct net *net = dev_net(arg->dev);
3973 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3974 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3975 fib6_update_sernum_upto_root(net, rt);
3976 rt6_multipath_rebalance(rt);
3982 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3984 struct arg_netdev_event arg = {
3987 .nh_flags = nh_flags,
3991 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3992 arg.nh_flags |= RTNH_F_LINKDOWN;
3994 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3997 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3998 const struct net_device *dev)
4000 struct fib6_info *iter;
4002 if (rt->fib6_nh.nh_dev == dev)
4004 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4005 if (iter->fib6_nh.nh_dev == dev)
4011 static void rt6_multipath_flush(struct fib6_info *rt)
4013 struct fib6_info *iter;
4015 rt->should_flush = 1;
4016 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4017 iter->should_flush = 1;
4020 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4021 const struct net_device *down_dev)
4023 struct fib6_info *iter;
4024 unsigned int dead = 0;
4026 if (rt->fib6_nh.nh_dev == down_dev ||
4027 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4029 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4030 if (iter->fib6_nh.nh_dev == down_dev ||
4031 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
4037 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4038 const struct net_device *dev,
4039 unsigned int nh_flags)
4041 struct fib6_info *iter;
4043 if (rt->fib6_nh.nh_dev == dev)
4044 rt->fib6_nh.nh_flags |= nh_flags;
4045 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4046 if (iter->fib6_nh.nh_dev == dev)
4047 iter->fib6_nh.nh_flags |= nh_flags;
4050 /* called with write lock held for table with rt */
4051 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4053 const struct arg_netdev_event *arg = p_arg;
4054 const struct net_device *dev = arg->dev;
4055 struct net *net = dev_net(dev);
4057 if (rt == net->ipv6.fib6_null_entry)
4060 switch (arg->event) {
4061 case NETDEV_UNREGISTER:
4062 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4064 if (rt->should_flush)
4066 if (!rt->fib6_nsiblings)
4067 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4068 if (rt6_multipath_uses_dev(rt, dev)) {
4071 count = rt6_multipath_dead_count(rt, dev);
4072 if (rt->fib6_nsiblings + 1 == count) {
4073 rt6_multipath_flush(rt);
4076 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4078 fib6_update_sernum(net, rt);
4079 rt6_multipath_rebalance(rt);
4083 if (rt->fib6_nh.nh_dev != dev ||
4084 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4086 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4087 rt6_multipath_rebalance(rt);
4094 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4096 struct arg_netdev_event arg = {
4103 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4106 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4108 rt6_sync_down_dev(dev, event);
4109 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4110 neigh_ifdown(&nd_tbl, dev);
4113 struct rt6_mtu_change_arg {
4114 struct net_device *dev;
4118 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4120 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4121 struct inet6_dev *idev;
4123 /* In IPv6 pmtu discovery is not optional,
4124 so the RTAX_MTU lock cannot disable it.
4125 We still use this lock to block changes
4126 caused by addrconf/ndisc.
4129 idev = __in6_dev_get(arg->dev);
4133 /* For an administrative MTU increase, there is no way to discover
4134 the IPv6 PMTU increase, so the PMTU must be updated here.
4135 Since RFC 1981 doesn't cover administrative MTU increases,
4136 updating the PMTU on an increase is a MUST (e.g. for jumbo frames).
4138 if (rt->fib6_nh.nh_dev == arg->dev &&
4139 !fib6_metric_locked(rt, RTAX_MTU)) {
4140 u32 mtu = rt->fib6_pmtu;
4142 if (mtu >= arg->mtu ||
4143 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4144 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4146 spin_lock_bh(&rt6_exception_lock);
4147 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4148 spin_unlock_bh(&rt6_exception_lock);
4153 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4155 struct rt6_mtu_change_arg arg = {
4160 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4163 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4164 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4165 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4166 [RTA_OIF] = { .type = NLA_U32 },
4167 [RTA_IIF] = { .type = NLA_U32 },
4168 [RTA_PRIORITY] = { .type = NLA_U32 },
4169 [RTA_METRICS] = { .type = NLA_NESTED },
4170 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4171 [RTA_PREF] = { .type = NLA_U8 },
4172 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4173 [RTA_ENCAP] = { .type = NLA_NESTED },
4174 [RTA_EXPIRES] = { .type = NLA_U32 },
4175 [RTA_UID] = { .type = NLA_U32 },
4176 [RTA_MARK] = { .type = NLA_U32 },
4177 [RTA_TABLE] = { .type = NLA_U32 },
4178 [RTA_IP_PROTO] = { .type = NLA_U8 },
4179 [RTA_SPORT] = { .type = NLA_U16 },
4180 [RTA_DPORT] = { .type = NLA_U16 },
4183 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4184 struct fib6_config *cfg,
4185 struct netlink_ext_ack *extack)
4188 struct nlattr *tb[RTA_MAX+1];
4192 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4198 rtm = nlmsg_data(nlh);
4199 memset(cfg, 0, sizeof(*cfg));
4201 cfg->fc_table = rtm->rtm_table;
4202 cfg->fc_dst_len = rtm->rtm_dst_len;
4203 cfg->fc_src_len = rtm->rtm_src_len;
4204 cfg->fc_flags = RTF_UP;
4205 cfg->fc_protocol = rtm->rtm_protocol;
4206 cfg->fc_type = rtm->rtm_type;
4208 if (rtm->rtm_type == RTN_UNREACHABLE ||
4209 rtm->rtm_type == RTN_BLACKHOLE ||
4210 rtm->rtm_type == RTN_PROHIBIT ||
4211 rtm->rtm_type == RTN_THROW)
4212 cfg->fc_flags |= RTF_REJECT;
4214 if (rtm->rtm_type == RTN_LOCAL)
4215 cfg->fc_flags |= RTF_LOCAL;
4217 if (rtm->rtm_flags & RTM_F_CLONED)
4218 cfg->fc_flags |= RTF_CACHE;
4220 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4222 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4223 cfg->fc_nlinfo.nlh = nlh;
4224 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4226 if (tb[RTA_GATEWAY]) {
4227 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4228 cfg->fc_flags |= RTF_GATEWAY;
4231 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4236 int plen = (rtm->rtm_dst_len + 7) >> 3;
4238 if (nla_len(tb[RTA_DST]) < plen)
4241 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4245 int plen = (rtm->rtm_src_len + 7) >> 3;
4247 if (nla_len(tb[RTA_SRC]) < plen)
4250 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4253 if (tb[RTA_PREFSRC])
4254 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4257 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4259 if (tb[RTA_PRIORITY])
4260 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4262 if (tb[RTA_METRICS]) {
4263 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4264 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4268 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4270 if (tb[RTA_MULTIPATH]) {
4271 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4272 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4274 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4275 cfg->fc_mp_len, extack);
4281 pref = nla_get_u8(tb[RTA_PREF]);
4282 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4283 pref != ICMPV6_ROUTER_PREF_HIGH)
4284 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4285 cfg->fc_flags |= RTF_PREF(pref);
4289 cfg->fc_encap = tb[RTA_ENCAP];
4291 if (tb[RTA_ENCAP_TYPE]) {
4292 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4294 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4299 if (tb[RTA_EXPIRES]) {
4300 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4302 if (addrconf_finite_timeout(timeout)) {
4303 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4304 cfg->fc_flags |= RTF_EXPIRES;
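/* Illustrative mapping (assumed iproute2 syntax):
 *
 *	ip -6 route add 2001:db8::/64 via fe80::1 dev eth0 expires 300
 *
 * arrives as an RTA_EXPIRES attribute of 300 seconds, which the block
 * above converts to a clock_t interval and marks with RTF_EXPIRES.
 */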
4314 struct fib6_info *fib6_info;
4315 struct fib6_config r_cfg;
4316 struct list_head next;
4319 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4323 list_for_each_entry(nh, rt6_nh_list, next) {
4324 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4325 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4326 nh->r_cfg.fc_ifindex);
4330 static int ip6_route_info_append(struct net *net,
4331 struct list_head *rt6_nh_list,
4332 struct fib6_info *rt,
4333 struct fib6_config *r_cfg)
4338 list_for_each_entry(nh, rt6_nh_list, next) {
4339 /* check if fib6_info already exists */
4340 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4344 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4348 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4349 list_add_tail(&nh->next, rt6_nh_list);
4354 static void ip6_route_mpath_notify(struct fib6_info *rt,
4355 struct fib6_info *rt_last,
4356 struct nl_info *info,
4359 /* if this is an APPEND route, then rt points to the first route
4360 * inserted and rt_last points to the last route inserted. Userspace
4361 * wants a consistent dump of the route which starts at the first
4362 * nexthop. Since sibling routes are always added at the end of
4363 * the list, find the first sibling of the last route appended
4365 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4366 rt = list_first_entry(&rt_last->fib6_siblings,
4372 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4375 static int ip6_route_multipath_add(struct fib6_config *cfg,
4376 struct netlink_ext_ack *extack)
4378 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4379 struct nl_info *info = &cfg->fc_nlinfo;
4380 struct fib6_config r_cfg;
4381 struct rtnexthop *rtnh;
4382 struct fib6_info *rt;
4383 struct rt6_nh *err_nh;
4384 struct rt6_nh *nh, *nh_safe;
4390 int replace = (cfg->fc_nlinfo.nlh &&
4391 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4392 LIST_HEAD(rt6_nh_list);
4394 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4395 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4396 nlflags |= NLM_F_APPEND;
4398 remaining = cfg->fc_mp_len;
4399 rtnh = (struct rtnexthop *)cfg->fc_mp;
4401 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4402 * fib6_info structs per nexthop
4404 while (rtnh_ok(rtnh, remaining)) {
4405 memcpy(&r_cfg, cfg, sizeof(*cfg));
4406 if (rtnh->rtnh_ifindex)
4407 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4409 attrlen = rtnh_attrlen(rtnh);
4411 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4413 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4415 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4416 r_cfg.fc_flags |= RTF_GATEWAY;
4418 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4419 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4421 r_cfg.fc_encap_type = nla_get_u16(nla);
4424 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4425 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4431 if (!rt6_qualify_for_ecmp(rt)) {
4433 NL_SET_ERR_MSG(extack,
4434 "Device only routes can not be added for IPv6 using the multipath API.");
4435 fib6_info_release(rt);
4439 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4441 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4444 fib6_info_release(rt);
4448 rtnh = rtnh_next(rtnh, &remaining);
4451 /* for add and replace send one notification with all nexthops.
4452 * Skip the notification in fib6_add_rt2node and send one with
4453 * the full route when done
4455 info->skip_notify = 1;
4458 list_for_each_entry(nh, &rt6_nh_list, next) {
4459 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4460 fib6_info_release(nh->fib6_info);
4463 /* save reference to last route successfully inserted */
4464 rt_last = nh->fib6_info;
4466 /* save reference to first route for notification */
4468 rt_notif = nh->fib6_info;
4471 /* nh->fib6_info is used or freed at this point, reset to NULL */
4472 nh->fib6_info = NULL;
4475 ip6_print_replace_route_err(&rt6_nh_list);
4480 /* Because each route is added like a single route we remove
4481 * these flags after the first nexthop: if there is a collision,
4482 * we have already failed to add the first nexthop:
4483 * fib6_add_rt2node() has rejected it; when replacing, old
4484 * nexthops have been replaced by the first new one, and the rest should be added to it.
4487 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4492 /* success ... tell user about new route */
4493 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4497 /* send notification for routes that were added so that
4498 * the delete notifications sent by ip6_route_del are coherent
4502 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4504 /* Delete routes that were already added */
4505 list_for_each_entry(nh, &rt6_nh_list, next) {
4508 ip6_route_del(&nh->r_cfg, extack);
4512 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4514 fib6_info_release(nh->fib6_info);
4515 list_del(&nh->next);
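/* Illustrative userspace form of a multipath add handled above (assumed
 * iproute2 syntax):
 *
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 weight 1 \
 *		nexthop via fe80::2 dev eth1 weight 3
 *
 * Each "nexthop" stanza becomes one rtnexthop inside RTA_MULTIPATH and
 * one fib6_info on rt6_nh_list; note rtnh_hops carries weight - 1.
 */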
4522 static int ip6_route_multipath_del(struct fib6_config *cfg,
4523 struct netlink_ext_ack *extack)
4525 struct fib6_config r_cfg;
4526 struct rtnexthop *rtnh;
4529 int err = 1, last_err = 0;
4531 remaining = cfg->fc_mp_len;
4532 rtnh = (struct rtnexthop *)cfg->fc_mp;
4534 /* Parse a Multipath Entry */
4535 while (rtnh_ok(rtnh, remaining)) {
4536 memcpy(&r_cfg, cfg, sizeof(*cfg));
4537 if (rtnh->rtnh_ifindex)
4538 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4540 attrlen = rtnh_attrlen(rtnh);
4542 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4544 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4546 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4547 r_cfg.fc_flags |= RTF_GATEWAY;
4550 err = ip6_route_del(&r_cfg, extack);
4554 rtnh = rtnh_next(rtnh, &remaining);
4560 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4561 struct netlink_ext_ack *extack)
4563 struct fib6_config cfg;
4566 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4571 return ip6_route_multipath_del(&cfg, extack);
4573 cfg.fc_delete_all_nh = 1;
4574 return ip6_route_del(&cfg, extack);
4578 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4579 struct netlink_ext_ack *extack)
4581 struct fib6_config cfg;
4584 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4589 return ip6_route_multipath_add(&cfg, extack);
4591 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4594 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4596 int nexthop_len = 0;
4598 if (rt->fib6_nsiblings) {
4599 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4600 + NLA_ALIGN(sizeof(struct rtnexthop))
4601 + nla_total_size(16) /* RTA_GATEWAY */
4602 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4604 nexthop_len *= rt->fib6_nsiblings;
4607 return NLMSG_ALIGN(sizeof(struct rtmsg))
4608 + nla_total_size(16) /* RTA_SRC */
4609 + nla_total_size(16) /* RTA_DST */
4610 + nla_total_size(16) /* RTA_GATEWAY */
4611 + nla_total_size(16) /* RTA_PREFSRC */
4612 + nla_total_size(4) /* RTA_TABLE */
4613 + nla_total_size(4) /* RTA_IIF */
4614 + nla_total_size(4) /* RTA_OIF */
4615 + nla_total_size(4) /* RTA_PRIORITY */
4616 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4617 + nla_total_size(sizeof(struct rta_cacheinfo))
4618 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4619 + nla_total_size(1) /* RTA_PREF */
4620 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4624 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4625 unsigned int *flags, bool skip_oif)
4627 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4628 *flags |= RTNH_F_DEAD;
4630 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4631 *flags |= RTNH_F_LINKDOWN;
4634 if (fib6_ignore_linkdown(rt))
4635 *flags |= RTNH_F_DEAD;
4639 if (rt->fib6_flags & RTF_GATEWAY) {
4640 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4641 goto nla_put_failure;
4644 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4645 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4646 *flags |= RTNH_F_OFFLOAD;
4648 /* not needed for multipath encoding b/c it has a rtnexthop struct */
4649 if (!skip_oif && rt->fib6_nh.nh_dev &&
4650 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4651 goto nla_put_failure;
4653 if (rt->fib6_nh.nh_lwtstate &&
4654 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4655 goto nla_put_failure;
4663 /* add multipath next hop */
4664 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4666 const struct net_device *dev = rt->fib6_nh.nh_dev;
4667 struct rtnexthop *rtnh;
4668 unsigned int flags = 0;
4670 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4672 goto nla_put_failure;
4674 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4675 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4677 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4678 goto nla_put_failure;
4680 rtnh->rtnh_flags = flags;
4682 /* length of rtnetlink header + attributes */
4683 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4691 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4692 struct fib6_info *rt, struct dst_entry *dst,
4693 struct in6_addr *dest, struct in6_addr *src,
4694 int iif, int type, u32 portid, u32 seq,
4697 struct rt6_info *rt6 = (struct rt6_info *)dst;
4698 struct rt6key *rt6_dst, *rt6_src;
4699 u32 *pmetrics, table, rt6_flags;
4700 struct nlmsghdr *nlh;
4704 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4709 rt6_dst = &rt6->rt6i_dst;
4710 rt6_src = &rt6->rt6i_src;
4711 rt6_flags = rt6->rt6i_flags;
4713 rt6_dst = &rt->fib6_dst;
4714 rt6_src = &rt->fib6_src;
4715 rt6_flags = rt->fib6_flags;
4718 rtm = nlmsg_data(nlh);
4719 rtm->rtm_family = AF_INET6;
4720 rtm->rtm_dst_len = rt6_dst->plen;
4721 rtm->rtm_src_len = rt6_src->plen;
4724 table = rt->fib6_table->tb6_id;
4726 table = RT6_TABLE_UNSPEC;
4727 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4728 if (nla_put_u32(skb, RTA_TABLE, table))
4729 goto nla_put_failure;
4731 rtm->rtm_type = rt->fib6_type;
4733 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4734 rtm->rtm_protocol = rt->fib6_protocol;
4736 if (rt6_flags & RTF_CACHE)
4737 rtm->rtm_flags |= RTM_F_CLONED;
4740 if (nla_put_in6_addr(skb, RTA_DST, dest))
4741 goto nla_put_failure;
4742 rtm->rtm_dst_len = 128;
4743 } else if (rtm->rtm_dst_len)
4744 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4745 goto nla_put_failure;
4746 #ifdef CONFIG_IPV6_SUBTREES
4748 if (nla_put_in6_addr(skb, RTA_SRC, src))
4749 goto nla_put_failure;
4750 rtm->rtm_src_len = 128;
4751 } else if (rtm->rtm_src_len &&
4752 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4753 goto nla_put_failure;
4756 #ifdef CONFIG_IPV6_MROUTE
4757 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4758 int err = ip6mr_get_route(net, skb, rtm, portid);
4763 goto nla_put_failure;
4766 if (nla_put_u32(skb, RTA_IIF, iif))
4767 goto nla_put_failure;
4769 struct in6_addr saddr_buf;
4770 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4771 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4772 goto nla_put_failure;
4775 if (rt->fib6_prefsrc.plen) {
4776 struct in6_addr saddr_buf;
4777 saddr_buf = rt->fib6_prefsrc.addr;
4778 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4779 goto nla_put_failure;
4782 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4783 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4784 goto nla_put_failure;
4786 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4787 goto nla_put_failure;
4789 /* For multipath routes, walk the siblings list and add
4790 * each as a nexthop within RTA_MULTIPATH.
4793 if (rt6_flags & RTF_GATEWAY &&
4794 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4795 goto nla_put_failure;
4797 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4798 goto nla_put_failure;
4799 } else if (rt->fib6_nsiblings) {
4800 struct fib6_info *sibling, *next_sibling;
4803 mp = nla_nest_start(skb, RTA_MULTIPATH);
4805 goto nla_put_failure;
4807 if (rt6_add_nexthop(skb, rt) < 0)
4808 goto nla_put_failure;
4810 list_for_each_entry_safe(sibling, next_sibling,
4811 &rt->fib6_siblings, fib6_siblings) {
4812 if (rt6_add_nexthop(skb, sibling) < 0)
4813 goto nla_put_failure;
4816 nla_nest_end(skb, mp);
4818 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4819 goto nla_put_failure;
4822 if (rt6_flags & RTF_EXPIRES) {
4823 expires = dst ? dst->expires : rt->expires;
4827 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4828 goto nla_put_failure;
4830 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4831 goto nla_put_failure;
4834 nlmsg_end(skb, nlh);
4838 nlmsg_cancel(skb, nlh);
4842 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4844 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4845 struct net *net = arg->net;
4847 if (rt == net->ipv6.fib6_null_entry)
4850 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4851 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4853 /* user wants prefix routes only */
4854 if (rtm->rtm_flags & RTM_F_PREFIX &&
4855 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4856 /* success since this is not a prefix route */
4861 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4862 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4863 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4866 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4867 struct netlink_ext_ack *extack)
4869 struct net *net = sock_net(in_skb->sk);
4870 struct nlattr *tb[RTA_MAX+1];
4871 int err, iif = 0, oif = 0;
4872 struct fib6_info *from;
4873 struct dst_entry *dst;
4874 struct rt6_info *rt;
4875 struct sk_buff *skb;
4880 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4886 memset(&fl6, 0, sizeof(fl6));
4887 rtm = nlmsg_data(nlh);
4888 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4889 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4892 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4895 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4899 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4902 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4906 iif = nla_get_u32(tb[RTA_IIF]);
4909 oif = nla_get_u32(tb[RTA_OIF]);
4912 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4915 fl6.flowi6_uid = make_kuid(current_user_ns(),
4916 nla_get_u32(tb[RTA_UID]));
4918 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4921 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4924 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4926 if (tb[RTA_IP_PROTO]) {
4927 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4928 &fl6.flowi6_proto, AF_INET6,
4935 struct net_device *dev;
4940 dev = dev_get_by_index_rcu(net, iif);
4947 fl6.flowi6_iif = iif;
4949 if (!ipv6_addr_any(&fl6.saddr))
4950 flags |= RT6_LOOKUP_F_HAS_SADDR;
4952 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4956 fl6.flowi6_oif = oif;
4958 dst = ip6_route_output(net, NULL, &fl6);
4962 rt = container_of(dst, struct rt6_info, dst);
4963 if (rt->dst.error) {
4964 err = rt->dst.error;
4969 if (rt == net->ipv6.ip6_null_entry) {
4970 err = rt->dst.error;
4975 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4982 skb_dst_set(skb, &rt->dst);
4985 from = rcu_dereference(rt->from);
4988 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
4990 NETLINK_CB(in_skb).portid,
4993 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4994 &fl6.saddr, iif, RTM_NEWROUTE,
4995 NETLINK_CB(in_skb).portid,
5007 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5012 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5013 unsigned int nlm_flags)
5015 struct sk_buff *skb;
5016 struct net *net = info->nl_net;
5021 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5023 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5027 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5028 event, info->portid, seq, nlm_flags);
5030 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5031 WARN_ON(err == -EMSGSIZE);
5035 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5036 info->nlh, gfp_any());
5040 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5043 static int ip6_route_dev_notify(struct notifier_block *this,
5044 unsigned long event, void *ptr)
5046 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5047 struct net *net = dev_net(dev);
5049 if (!(dev->flags & IFF_LOOPBACK))
5052 if (event == NETDEV_REGISTER) {
5053 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5054 net->ipv6.ip6_null_entry->dst.dev = dev;
5055 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5056 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5057 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5058 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5059 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5060 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5062 } else if (event == NETDEV_UNREGISTER &&
5063 dev->reg_state != NETREG_UNREGISTERED) {
5064 /* NETDEV_UNREGISTER can be fired multiple times by
5065 * netdev_wait_allrefs(). Make sure we only call this once.
5067 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5068 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5069 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5070 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */

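/* Handler for the net.ipv6.route.flush sysctl: the sysctl is
 * write-only, and any write kicks off a garbage-collection pass
 * over the routing tables.
 */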
#ifdef CONFIG_SYSCTL

static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

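/* Template for the per-namespace tables under
 * /proc/sys/net/ipv6/route/; ipv6_route_sysctl_init() below clones it
 * and repoints each .data field at the namespace's own copy.
 */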
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{ }
};

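/* The table[] indices below must match the entry order in
 * ipv6_route_table_template above.
 */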
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif

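/* Per-namespace setup: clone the dst_ops template, allocate the
 * null (and, with CONFIG_IPV6_MULTIPLE_TABLES, prohibit/blackhole)
 * routes and seed the GC/PMTU sysctl defaults.
 */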
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

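/* Create /proc/net/ipv6_route and /proc/net/rt6_stats for this
 * namespace.
 */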
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}

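/* Subsystem init: dst cache, pernet subsystems, fib, policy rules and
 * the rtnetlink handlers, unwound in reverse order on failure.
 */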
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

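/* Tear down everything ip6_route_init() set up, in reverse order. */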
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}