ced2c9bed10b8e73d587efae954e53eb51fba37e
[platform/kernel/linux-rpi.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/*
 * Result of scoring a route against the state of its next-hop neighbour.
 * Every failure value is negative so callers can test "< 0".
 */
enum rt6_nud_state {
	/* find_match() drops the route outright */
	RT6_NUD_FAIL_HARD	= -3,
	/* rt6_check_neigh() saw the neighbour in NUD_FAILED (router-pref builds) */
	RT6_NUD_FAIL_PROBE	= -2,
	/* find_match() scores the route as 0 and requests round-robin */
	RT6_NUD_FAIL_DO_RR	= -1,
	/* the route may be used */
	RT6_NUD_SUCCEED		= 1,
};
80
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Helpers used by rt6_route_rcv(); only declared when route-info is built. */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
#endif
123
124 struct uncached_list {
125         spinlock_t              lock;
126         struct list_head        head;
127 };
128
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146                 struct net *net = dev_net(rt->dst.dev);
147
148                 spin_lock_bh(&ul->lock);
149                 list_del(&rt->rt6i_uncached);
150                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151                 spin_unlock_bh(&ul->lock);
152         }
153 }
154
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157         struct net_device *loopback_dev = net->loopback_dev;
158         int cpu;
159
160         if (dev == loopback_dev)
161                 return;
162
163         for_each_possible_cpu(cpu) {
164                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165                 struct rt6_info *rt;
166
167                 spin_lock_bh(&ul->lock);
168                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169                         struct inet6_dev *rt_idev = rt->rt6i_idev;
170                         struct net_device *rt_dev = rt->dst.dev;
171
172                         if (rt_idev->dev == dev) {
173                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
174                                 in6_dev_put(rt_idev);
175                         }
176
177                         if (rt_dev == dev) {
178                                 rt->dst.dev = loopback_dev;
179                                 dev_hold(rt->dst.dev);
180                                 dev_put(rt_dev);
181                         }
182                 }
183                 spin_unlock_bh(&ul->lock);
184         }
185 }
186
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189         return dst_metrics_write_ptr(&rt->from->dst);
190 }
191
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195
196         if (rt->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(rt);
198         else if (rt->rt6i_flags & RTF_CACHE)
199                 return NULL;
200         else
201                 return dst_cow_metrics_generic(dst, old);
202 }
203
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         struct in6_addr *p = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(p))
211                 return (const void *) p;
212         else if (skb)
213                 return &ipv6_hdr(skb)->daddr;
214         return daddr;
215 }
216
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218                                           struct sk_buff *skb,
219                                           const void *daddr)
220 {
221         struct rt6_info *rt = (struct rt6_info *) dst;
222         struct neighbour *n;
223
224         daddr = choose_neigh_daddr(rt, skb, daddr);
225         n = __ipv6_neigh_lookup(dst->dev, daddr);
226         if (n)
227                 return n;
228         return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233         struct net_device *dev = dst->dev;
234         struct rt6_info *rt = (struct rt6_info *)dst;
235
236         daddr = choose_neigh_daddr(rt, NULL, daddr);
237         if (!daddr)
238                 return;
239         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240                 return;
241         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242                 return;
243         __ipv6_confirm_neigh(dev, daddr);
244 }
245
246 static struct dst_ops ip6_dst_ops_template = {
247         .family                 =       AF_INET6,
248         .gc                     =       ip6_dst_gc,
249         .gc_thresh              =       1024,
250         .check                  =       ip6_dst_check,
251         .default_advmss         =       ip6_default_advmss,
252         .mtu                    =       ip6_mtu,
253         .cow_metrics            =       ipv6_cow_metrics,
254         .destroy                =       ip6_dst_destroy,
255         .ifdown                 =       ip6_dst_ifdown,
256         .negative_advice        =       ip6_negative_advice,
257         .link_failure           =       ip6_link_failure,
258         .update_pmtu            =       ip6_rt_update_pmtu,
259         .redirect               =       rt6_do_redirect,
260         .local_out              =       __ip6_local_out,
261         .neigh_lookup           =       ip6_neigh_lookup,
262         .confirm_neigh          =       ip6_confirm_neigh,
263 };
264
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return mtu ? : dst->dev->mtu;
270 }
271
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273                                          struct sk_buff *skb, u32 mtu)
274 {
275 }
276
/* Redirects are deliberately ignored for blackhole dsts. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
	/* intentionally empty */
}
281
282 static struct dst_ops ip6_dst_blackhole_ops = {
283         .family                 =       AF_INET6,
284         .destroy                =       ip6_dst_destroy,
285         .check                  =       ip6_dst_check,
286         .mtu                    =       ip6_blackhole_mtu,
287         .default_advmss         =       ip6_default_advmss,
288         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
289         .redirect               =       ip6_rt_blackhole_redirect,
290         .cow_metrics            =       dst_cow_metrics_generic,
291         .neigh_lookup           =       ip6_neigh_lookup,
292 };
293
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295         [RTAX_HOPLIMIT - 1] = 0,
296 };
297
298 static const struct rt6_info ip6_null_entry_template = {
299         .dst = {
300                 .__refcnt       = ATOMIC_INIT(1),
301                 .__use          = 1,
302                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
303                 .error          = -ENETUNREACH,
304                 .input          = ip6_pkt_discard,
305                 .output         = ip6_pkt_discard_out,
306         },
307         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
308         .rt6i_protocol  = RTPROT_KERNEL,
309         .rt6i_metric    = ~(u32) 0,
310         .rt6i_ref       = ATOMIC_INIT(1),
311 };
312
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/*
 * Template for the "prohibit" entry: a reject route that reports
 * -EACCES and hands packets to the ip6_pkt_prohibit handlers.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32)0,	/* largest possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};

/*
 * Template for the "blackhole" entry: a reject route that reports
 * -EINVAL and uses the generic dst_discard handlers.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32)0,	/* largest possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
346
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349         struct dst_entry *dst = &rt->dst;
350
351         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352         INIT_LIST_HEAD(&rt->rt6i_siblings);
353         INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358                                         struct net_device *dev,
359                                         int flags)
360 {
361         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362                                         1, DST_OBSOLETE_FORCE_CHK, flags);
363
364         if (rt) {
365                 rt6_info_init(rt);
366                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367         }
368
369         return rt;
370 }
371
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373                                struct net_device *dev,
374                                int flags)
375 {
376         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377
378         if (rt) {
379                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380                 if (!rt->rt6i_pcpu) {
381                         dst_release_immediate(&rt->dst);
382                         return NULL;
383                 }
384         }
385
386         return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392         struct rt6_info *rt = (struct rt6_info *)dst;
393         struct rt6_exception_bucket *bucket;
394         struct rt6_info *from = rt->from;
395         struct inet6_dev *idev;
396
397         dst_destroy_metrics_generic(dst);
398         free_percpu(rt->rt6i_pcpu);
399         rt6_uncached_list_del(rt);
400
401         idev = rt->rt6i_idev;
402         if (idev) {
403                 rt->rt6i_idev = NULL;
404                 in6_dev_put(idev);
405         }
406         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407         if (bucket) {
408                 rt->rt6i_exception_bucket = NULL;
409                 kfree(bucket);
410         }
411
412         rt->from = NULL;
413         dst_release(&from->dst);
414 }
415
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417                            int how)
418 {
419         struct rt6_info *rt = (struct rt6_info *)dst;
420         struct inet6_dev *idev = rt->rt6i_idev;
421         struct net_device *loopback_dev =
422                 dev_net(dev)->loopback_dev;
423
424         if (idev && idev->dev != loopback_dev) {
425                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426                 if (loopback_idev) {
427                         rt->rt6i_idev = loopback_idev;
428                         in6_dev_put(idev);
429                 }
430         }
431 }
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443         if (rt->rt6i_flags & RTF_EXPIRES) {
444                 if (time_after(jiffies, rt->dst.expires))
445                         return true;
446         } else if (rt->from) {
447                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448                         rt6_check_expired(rt->from);
449         }
450         return false;
451 }
452
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454                                              struct flowi6 *fl6, int oif,
455                                              int strict)
456 {
457         struct rt6_info *sibling, *next_sibling;
458         int route_choosen;
459
460         /* We might have already computed the hash for ICMPv6 errors. In such
461          * case it will always be non-zero. Otherwise now is the time to do it.
462          */
463         if (!fl6->mp_hash)
464                 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
465
466         route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
467         /* Don't change the route, if route_choosen == 0
468          * (siblings does not include ourself)
469          */
470         if (route_choosen)
471                 list_for_each_entry_safe(sibling, next_sibling,
472                                 &match->rt6i_siblings, rt6i_siblings) {
473                         route_choosen--;
474                         if (route_choosen == 0) {
475                                 struct inet6_dev *idev = sibling->rt6i_idev;
476
477                                 if (sibling->rt6i_nh_flags & RTNH_F_DEAD)
478                                         break;
479                                 if (sibling->rt6i_nh_flags & RTNH_F_LINKDOWN &&
480                                     idev->cnf.ignore_routes_with_linkdown)
481                                         break;
482                                 if (rt6_score_route(sibling, oif, strict) < 0)
483                                         break;
484                                 match = sibling;
485                                 break;
486                         }
487                 }
488         return match;
489 }
490
491 /*
492  *      Route lookup. rcu_read_lock() should be held.
493  */
494
495 static inline struct rt6_info *rt6_device_match(struct net *net,
496                                                     struct rt6_info *rt,
497                                                     const struct in6_addr *saddr,
498                                                     int oif,
499                                                     int flags)
500 {
501         struct rt6_info *local = NULL;
502         struct rt6_info *sprt;
503
504         if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
505                 return rt;
506
507         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
508                 struct net_device *dev = sprt->dst.dev;
509
510                 if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
511                         continue;
512
513                 if (oif) {
514                         if (dev->ifindex == oif)
515                                 return sprt;
516                         if (dev->flags & IFF_LOOPBACK) {
517                                 if (!sprt->rt6i_idev ||
518                                     sprt->rt6i_idev->dev->ifindex != oif) {
519                                         if (flags & RT6_LOOKUP_F_IFACE)
520                                                 continue;
521                                         if (local &&
522                                             local->rt6i_idev->dev->ifindex == oif)
523                                                 continue;
524                                 }
525                                 local = sprt;
526                         }
527                 } else {
528                         if (ipv6_chk_addr(net, saddr, dev,
529                                           flags & RT6_LOOKUP_F_IFACE))
530                                 return sprt;
531                 }
532         }
533
534         if (oif) {
535                 if (local)
536                         return local;
537
538                 if (flags & RT6_LOOKUP_F_IFACE)
539                         return net->ipv6.ip6_null_entry;
540         }
541
542         return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
543 }
544
545 #ifdef CONFIG_IPV6_ROUTER_PREF
546 struct __rt6_probe_work {
547         struct work_struct work;
548         struct in6_addr target;
549         struct net_device *dev;
550 };
551
552 static void rt6_probe_deferred(struct work_struct *w)
553 {
554         struct in6_addr mcaddr;
555         struct __rt6_probe_work *work =
556                 container_of(w, struct __rt6_probe_work, work);
557
558         addrconf_addr_solict_mult(&work->target, &mcaddr);
559         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
560         dev_put(work->dev);
561         kfree(work);
562 }
563
564 static void rt6_probe(struct rt6_info *rt)
565 {
566         struct __rt6_probe_work *work;
567         struct neighbour *neigh;
568         /*
569          * Okay, this does not seem to be appropriate
570          * for now, however, we need to check if it
571          * is really so; aka Router Reachability Probing.
572          *
573          * Router Reachability Probe MUST be rate-limited
574          * to no more than one per minute.
575          */
576         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
577                 return;
578         rcu_read_lock_bh();
579         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
580         if (neigh) {
581                 if (neigh->nud_state & NUD_VALID)
582                         goto out;
583
584                 work = NULL;
585                 write_lock(&neigh->lock);
586                 if (!(neigh->nud_state & NUD_VALID) &&
587                     time_after(jiffies,
588                                neigh->updated +
589                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
590                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
591                         if (work)
592                                 __neigh_set_probe_once(neigh);
593                 }
594                 write_unlock(&neigh->lock);
595         } else {
596                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
597         }
598
599         if (work) {
600                 INIT_WORK(&work->work, rt6_probe_deferred);
601                 work->target = rt->rt6i_gateway;
602                 dev_hold(rt->dst.dev);
603                 work->dev = rt->dst.dev;
604                 schedule_work(&work->work);
605         }
606
607 out:
608         rcu_read_unlock_bh();
609 }
610 #else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
	/* intentionally empty */
}
614 #endif
615
616 /*
617  * Default Router Selection (RFC 2461 6.3.6)
618  */
619 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
620 {
621         struct net_device *dev = rt->dst.dev;
622         if (!oif || dev->ifindex == oif)
623                 return 2;
624         if ((dev->flags & IFF_LOOPBACK) &&
625             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
626                 return 1;
627         return 0;
628 }
629
630 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
631 {
632         struct neighbour *neigh;
633         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
634
635         if (rt->rt6i_flags & RTF_NONEXTHOP ||
636             !(rt->rt6i_flags & RTF_GATEWAY))
637                 return RT6_NUD_SUCCEED;
638
639         rcu_read_lock_bh();
640         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
641         if (neigh) {
642                 read_lock(&neigh->lock);
643                 if (neigh->nud_state & NUD_VALID)
644                         ret = RT6_NUD_SUCCEED;
645 #ifdef CONFIG_IPV6_ROUTER_PREF
646                 else if (!(neigh->nud_state & NUD_FAILED))
647                         ret = RT6_NUD_SUCCEED;
648                 else
649                         ret = RT6_NUD_FAIL_PROBE;
650 #endif
651                 read_unlock(&neigh->lock);
652         } else {
653                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
654                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
655         }
656         rcu_read_unlock_bh();
657
658         return ret;
659 }
660
661 static int rt6_score_route(struct rt6_info *rt, int oif,
662                            int strict)
663 {
664         int m;
665
666         m = rt6_check_dev(rt, oif);
667         if (!m && (strict & RT6_LOOKUP_F_IFACE))
668                 return RT6_NUD_FAIL_HARD;
669 #ifdef CONFIG_IPV6_ROUTER_PREF
670         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
671 #endif
672         if (strict & RT6_LOOKUP_F_REACHABLE) {
673                 int n = rt6_check_neigh(rt);
674                 if (n < 0)
675                         return n;
676         }
677         return m;
678 }
679
680 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
681                                    int *mpri, struct rt6_info *match,
682                                    bool *do_rr)
683 {
684         int m;
685         bool match_do_rr = false;
686         struct inet6_dev *idev = rt->rt6i_idev;
687
688         if (rt->rt6i_nh_flags & RTNH_F_DEAD)
689                 goto out;
690
691         if (idev->cnf.ignore_routes_with_linkdown &&
692             rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
693             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
694                 goto out;
695
696         if (rt6_check_expired(rt))
697                 goto out;
698
699         m = rt6_score_route(rt, oif, strict);
700         if (m == RT6_NUD_FAIL_DO_RR) {
701                 match_do_rr = true;
702                 m = 0; /* lowest valid score */
703         } else if (m == RT6_NUD_FAIL_HARD) {
704                 goto out;
705         }
706
707         if (strict & RT6_LOOKUP_F_REACHABLE)
708                 rt6_probe(rt);
709
710         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
711         if (m > *mpri) {
712                 *do_rr = match_do_rr;
713                 *mpri = m;
714                 match = rt;
715         }
716 out:
717         return match;
718 }
719
720 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
721                                      struct rt6_info *leaf,
722                                      struct rt6_info *rr_head,
723                                      u32 metric, int oif, int strict,
724                                      bool *do_rr)
725 {
726         struct rt6_info *rt, *match, *cont;
727         int mpri = -1;
728
729         match = NULL;
730         cont = NULL;
731         for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
732                 if (rt->rt6i_metric != metric) {
733                         cont = rt;
734                         break;
735                 }
736
737                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
738         }
739
740         for (rt = leaf; rt && rt != rr_head;
741              rt = rcu_dereference(rt->rt6_next)) {
742                 if (rt->rt6i_metric != metric) {
743                         cont = rt;
744                         break;
745                 }
746
747                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
748         }
749
750         if (match || !cont)
751                 return match;
752
753         for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
754                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
755
756         return match;
757 }
758
759 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
760                                    int oif, int strict)
761 {
762         struct rt6_info *leaf = rcu_dereference(fn->leaf);
763         struct rt6_info *match, *rt0;
764         bool do_rr = false;
765         int key_plen;
766
767         if (!leaf || leaf == net->ipv6.ip6_null_entry)
768                 return net->ipv6.ip6_null_entry;
769
770         rt0 = rcu_dereference(fn->rr_ptr);
771         if (!rt0)
772                 rt0 = leaf;
773
774         /* Double check to make sure fn is not an intermediate node
775          * and fn->leaf does not points to its child's leaf
776          * (This might happen if all routes under fn are deleted from
777          * the tree and fib6_repair_tree() is called on the node.)
778          */
779         key_plen = rt0->rt6i_dst.plen;
780 #ifdef CONFIG_IPV6_SUBTREES
781         if (rt0->rt6i_src.plen)
782                 key_plen = rt0->rt6i_src.plen;
783 #endif
784         if (fn->fn_bit != key_plen)
785                 return net->ipv6.ip6_null_entry;
786
787         match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
788                              &do_rr);
789
790         if (do_rr) {
791                 struct rt6_info *next = rcu_dereference(rt0->rt6_next);
792
793                 /* no entries matched; do round-robin */
794                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
795                         next = leaf;
796
797                 if (next != rt0) {
798                         spin_lock_bh(&leaf->rt6i_table->tb6_lock);
799                         /* make sure next is not being deleted from the tree */
800                         if (next->rt6i_node)
801                                 rcu_assign_pointer(fn->rr_ptr, next);
802                         spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
803                 }
804         }
805
806         return match ? match : net->ipv6.ip6_null_entry;
807 }
808
809 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
810 {
811         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
812 }
813
814 #ifdef CONFIG_IPV6_ROUTE_INFO
815 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
816                   const struct in6_addr *gwaddr)
817 {
818         struct net *net = dev_net(dev);
819         struct route_info *rinfo = (struct route_info *) opt;
820         struct in6_addr prefix_buf, *prefix;
821         unsigned int pref;
822         unsigned long lifetime;
823         struct rt6_info *rt;
824
825         if (len < sizeof(struct route_info)) {
826                 return -EINVAL;
827         }
828
829         /* Sanity check for prefix_len and length */
830         if (rinfo->length > 3) {
831                 return -EINVAL;
832         } else if (rinfo->prefix_len > 128) {
833                 return -EINVAL;
834         } else if (rinfo->prefix_len > 64) {
835                 if (rinfo->length < 2) {
836                         return -EINVAL;
837                 }
838         } else if (rinfo->prefix_len > 0) {
839                 if (rinfo->length < 1) {
840                         return -EINVAL;
841                 }
842         }
843
844         pref = rinfo->route_pref;
845         if (pref == ICMPV6_ROUTER_PREF_INVALID)
846                 return -EINVAL;
847
848         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
849
850         if (rinfo->length == 3)
851                 prefix = (struct in6_addr *)rinfo->prefix;
852         else {
853                 /* this function is safe */
854                 ipv6_addr_prefix(&prefix_buf,
855                                  (struct in6_addr *)rinfo->prefix,
856                                  rinfo->prefix_len);
857                 prefix = &prefix_buf;
858         }
859
860         if (rinfo->prefix_len == 0)
861                 rt = rt6_get_dflt_router(gwaddr, dev);
862         else
863                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
864                                         gwaddr, dev);
865
866         if (rt && !lifetime) {
867                 ip6_del_rt(rt);
868                 rt = NULL;
869         }
870
871         if (!rt && lifetime)
872                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
873                                         dev, pref);
874         else if (rt)
875                 rt->rt6i_flags = RTF_ROUTEINFO |
876                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
877
878         if (rt) {
879                 if (!addrconf_finite_timeout(lifetime))
880                         rt6_clean_expires(rt);
881                 else
882                         rt6_set_expires(rt, jiffies + HZ * lifetime);
883
884                 ip6_rt_put(rt);
885         }
886         return 0;
887 }
888 #endif
889
890 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
891                                         struct in6_addr *saddr)
892 {
893         struct fib6_node *pn, *sn;
894         while (1) {
895                 if (fn->fn_flags & RTN_TL_ROOT)
896                         return NULL;
897                 pn = rcu_dereference(fn->parent);
898                 sn = FIB6_SUBTREE(pn);
899                 if (sn && sn != fn)
900                         fn = fib6_lookup(sn, NULL, saddr);
901                 else
902                         fn = pn;
903                 if (fn->fn_flags & RTN_RTINFO)
904                         return fn;
905         }
906 }
907
908 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
909                           bool null_fallback)
910 {
911         struct rt6_info *rt = *prt;
912
913         if (dst_hold_safe(&rt->dst))
914                 return true;
915         if (null_fallback) {
916                 rt = net->ipv6.ip6_null_entry;
917                 dst_hold(&rt->dst);
918         } else {
919                 rt = NULL;
920         }
921         *prt = rt;
922         return false;
923 }
924
925 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
926                                              struct fib6_table *table,
927                                              struct flowi6 *fl6, int flags)
928 {
929         struct rt6_info *rt, *rt_cache;
930         struct fib6_node *fn;
931
932         rcu_read_lock();
933         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
934 restart:
935         rt = rcu_dereference(fn->leaf);
936         if (!rt) {
937                 rt = net->ipv6.ip6_null_entry;
938         } else {
939                 rt = rt6_device_match(net, rt, &fl6->saddr,
940                                       fl6->flowi6_oif, flags);
941                 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
942                         rt = rt6_multipath_select(rt, fl6,
943                                                   fl6->flowi6_oif, flags);
944         }
945         if (rt == net->ipv6.ip6_null_entry) {
946                 fn = fib6_backtrack(fn, &fl6->saddr);
947                 if (fn)
948                         goto restart;
949         }
950         /* Search through exception table */
951         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
952         if (rt_cache)
953                 rt = rt_cache;
954
955         if (ip6_hold_safe(net, &rt, true))
956                 dst_use_noref(&rt->dst, jiffies);
957
958         rcu_read_unlock();
959
960         trace_fib6_table_lookup(net, rt, table, fl6);
961
962         return rt;
963
964 }
965
966 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
967                                     int flags)
968 {
969         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
970 }
971 EXPORT_SYMBOL_GPL(ip6_route_lookup);
972
973 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
974                             const struct in6_addr *saddr, int oif, int strict)
975 {
976         struct flowi6 fl6 = {
977                 .flowi6_oif = oif,
978                 .daddr = *daddr,
979         };
980         struct dst_entry *dst;
981         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
982
983         if (saddr) {
984                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
985                 flags |= RT6_LOOKUP_F_HAS_SADDR;
986         }
987
988         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
989         if (dst->error == 0)
990                 return (struct rt6_info *) dst;
991
992         dst_release(dst);
993
994         return NULL;
995 }
996 EXPORT_SYMBOL(rt6_lookup);
997
998 /* ip6_ins_rt is called with FREE table->tb6_lock.
999  * It takes new route entry, the addition fails by any reason the
1000  * route is released.
1001  * Caller must hold dst before calling it.
1002  */
1003
1004 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
1005                         struct mx6_config *mxc,
1006                         struct netlink_ext_ack *extack)
1007 {
1008         int err;
1009         struct fib6_table *table;
1010
1011         table = rt->rt6i_table;
1012         spin_lock_bh(&table->tb6_lock);
1013         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1014         spin_unlock_bh(&table->tb6_lock);
1015
1016         return err;
1017 }
1018
1019 int ip6_ins_rt(struct rt6_info *rt)
1020 {
1021         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1022         struct mx6_config mxc = { .mx = NULL, };
1023
1024         /* Hold dst to account for the reference from the fib6 tree */
1025         dst_hold(&rt->dst);
1026         return __ip6_ins_rt(rt, &info, &mxc, NULL);
1027 }
1028
1029 /* called with rcu_lock held */
1030 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1031 {
1032         struct net_device *dev = rt->dst.dev;
1033
1034         if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1035                 /* for copies of local routes, dst->dev needs to be the
1036                  * device if it is a master device, the master device if
1037                  * device is enslaved, and the loopback as the default
1038                  */
1039                 if (netif_is_l3_slave(dev) &&
1040                     !rt6_need_strict(&rt->rt6i_dst.addr))
1041                         dev = l3mdev_master_dev_rcu(dev);
1042                 else if (!netif_is_l3_master(dev))
1043                         dev = dev_net(dev)->loopback_dev;
1044                 /* last case is netif_is_l3_master(dev) is true in which
1045                  * case we want dev returned to be dev
1046                  */
1047         }
1048
1049         return dev;
1050 }
1051
1052 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1053                                            const struct in6_addr *daddr,
1054                                            const struct in6_addr *saddr)
1055 {
1056         struct net_device *dev;
1057         struct rt6_info *rt;
1058
1059         /*
1060          *      Clone the route.
1061          */
1062
1063         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1064                 ort = ort->from;
1065
1066         rcu_read_lock();
1067         dev = ip6_rt_get_dev_rcu(ort);
1068         rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1069         rcu_read_unlock();
1070         if (!rt)
1071                 return NULL;
1072
1073         ip6_rt_copy_init(rt, ort);
1074         rt->rt6i_flags |= RTF_CACHE;
1075         rt->rt6i_metric = 0;
1076         rt->dst.flags |= DST_HOST;
1077         rt->rt6i_dst.addr = *daddr;
1078         rt->rt6i_dst.plen = 128;
1079
1080         if (!rt6_is_gw_or_nonexthop(ort)) {
1081                 if (ort->rt6i_dst.plen != 128 &&
1082                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1083                         rt->rt6i_flags |= RTF_ANYCAST;
1084 #ifdef CONFIG_IPV6_SUBTREES
1085                 if (rt->rt6i_src.plen && saddr) {
1086                         rt->rt6i_src.addr = *saddr;
1087                         rt->rt6i_src.plen = 128;
1088                 }
1089 #endif
1090         }
1091
1092         return rt;
1093 }
1094
1095 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1096 {
1097         struct net_device *dev;
1098         struct rt6_info *pcpu_rt;
1099
1100         rcu_read_lock();
1101         dev = ip6_rt_get_dev_rcu(rt);
1102         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1103         rcu_read_unlock();
1104         if (!pcpu_rt)
1105                 return NULL;
1106         ip6_rt_copy_init(pcpu_rt, rt);
1107         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1108         pcpu_rt->rt6i_flags |= RTF_PCPU;
1109         return pcpu_rt;
1110 }
1111
1112 /* It should be called with rcu_read_lock() acquired */
1113 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1114 {
1115         struct rt6_info *pcpu_rt, **p;
1116
1117         p = this_cpu_ptr(rt->rt6i_pcpu);
1118         pcpu_rt = *p;
1119
1120         if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1121                 rt6_dst_from_metrics_check(pcpu_rt);
1122
1123         return pcpu_rt;
1124 }
1125
1126 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1127 {
1128         struct rt6_info *pcpu_rt, *prev, **p;
1129
1130         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1131         if (!pcpu_rt) {
1132                 struct net *net = dev_net(rt->dst.dev);
1133
1134                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1135                 return net->ipv6.ip6_null_entry;
1136         }
1137
1138         dst_hold(&pcpu_rt->dst);
1139         p = this_cpu_ptr(rt->rt6i_pcpu);
1140         prev = cmpxchg(p, NULL, pcpu_rt);
1141         BUG_ON(prev);
1142
1143         rt6_dst_from_metrics_check(pcpu_rt);
1144         return pcpu_rt;
1145 }
1146
/* Exception (cached route) hash table implementation.
 * Modifications of the table are serialized by rt6_exception_lock.
 */
1149 static DEFINE_SPINLOCK(rt6_exception_lock);
1150
1151 /* Remove rt6_ex from hash table and free the memory
1152  * Caller must hold rt6_exception_lock
1153  */
1154 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1155                                  struct rt6_exception *rt6_ex)
1156 {
1157         struct net *net;
1158
1159         if (!bucket || !rt6_ex)
1160                 return;
1161
1162         net = dev_net(rt6_ex->rt6i->dst.dev);
1163         rt6_ex->rt6i->rt6i_node = NULL;
1164         hlist_del_rcu(&rt6_ex->hlist);
1165         rt6_release(rt6_ex->rt6i);
1166         kfree_rcu(rt6_ex, rcu);
1167         WARN_ON_ONCE(!bucket->depth);
1168         bucket->depth--;
1169         net->ipv6.rt6_stats->fib_rt_cache--;
1170 }
1171
1172 /* Remove oldest rt6_ex in bucket and free the memory
1173  * Caller must hold rt6_exception_lock
1174  */
1175 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1176 {
1177         struct rt6_exception *rt6_ex, *oldest = NULL;
1178
1179         if (!bucket)
1180                 return;
1181
1182         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1183                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1184                         oldest = rt6_ex;
1185         }
1186         rt6_remove_exception(bucket, oldest);
1187 }
1188
1189 static u32 rt6_exception_hash(const struct in6_addr *dst,
1190                               const struct in6_addr *src)
1191 {
1192         static u32 seed __read_mostly;
1193         u32 val;
1194
1195         net_get_random_once(&seed, sizeof(seed));
1196         val = jhash(dst, sizeof(*dst), seed);
1197
1198 #ifdef CONFIG_IPV6_SUBTREES
1199         if (src)
1200                 val = jhash(src, sizeof(*src), val);
1201 #endif
1202         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1203 }
1204
1205 /* Helper function to find the cached rt in the hash table
1206  * and update bucket pointer to point to the bucket for this
1207  * (daddr, saddr) pair
1208  * Caller must hold rt6_exception_lock
1209  */
1210 static struct rt6_exception *
1211 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1212                               const struct in6_addr *daddr,
1213                               const struct in6_addr *saddr)
1214 {
1215         struct rt6_exception *rt6_ex;
1216         u32 hval;
1217
1218         if (!(*bucket) || !daddr)
1219                 return NULL;
1220
1221         hval = rt6_exception_hash(daddr, saddr);
1222         *bucket += hval;
1223
1224         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1225                 struct rt6_info *rt6 = rt6_ex->rt6i;
1226                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1227
1228 #ifdef CONFIG_IPV6_SUBTREES
1229                 if (matched && saddr)
1230                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1231 #endif
1232                 if (matched)
1233                         return rt6_ex;
1234         }
1235         return NULL;
1236 }
1237
1238 /* Helper function to find the cached rt in the hash table
1239  * and update bucket pointer to point to the bucket for this
1240  * (daddr, saddr) pair
1241  * Caller must hold rcu_read_lock()
1242  */
1243 static struct rt6_exception *
1244 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1245                          const struct in6_addr *daddr,
1246                          const struct in6_addr *saddr)
1247 {
1248         struct rt6_exception *rt6_ex;
1249         u32 hval;
1250
1251         WARN_ON_ONCE(!rcu_read_lock_held());
1252
1253         if (!(*bucket) || !daddr)
1254                 return NULL;
1255
1256         hval = rt6_exception_hash(daddr, saddr);
1257         *bucket += hval;
1258
1259         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1260                 struct rt6_info *rt6 = rt6_ex->rt6i;
1261                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1262
1263 #ifdef CONFIG_IPV6_SUBTREES
1264                 if (matched && saddr)
1265                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1266 #endif
1267                 if (matched)
1268                         return rt6_ex;
1269         }
1270         return NULL;
1271 }
1272
1273 static int rt6_insert_exception(struct rt6_info *nrt,
1274                                 struct rt6_info *ort)
1275 {
1276         struct net *net = dev_net(ort->dst.dev);
1277         struct rt6_exception_bucket *bucket;
1278         struct in6_addr *src_key = NULL;
1279         struct rt6_exception *rt6_ex;
1280         int err = 0;
1281
1282         /* ort can't be a cache or pcpu route */
1283         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1284                 ort = ort->from;
1285         WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1286
1287         spin_lock_bh(&rt6_exception_lock);
1288
1289         if (ort->exception_bucket_flushed) {
1290                 err = -EINVAL;
1291                 goto out;
1292         }
1293
1294         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1295                                         lockdep_is_held(&rt6_exception_lock));
1296         if (!bucket) {
1297                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1298                                  GFP_ATOMIC);
1299                 if (!bucket) {
1300                         err = -ENOMEM;
1301                         goto out;
1302                 }
1303                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1304         }
1305
1306 #ifdef CONFIG_IPV6_SUBTREES
1307         /* rt6i_src.plen != 0 indicates ort is in subtree
1308          * and exception table is indexed by a hash of
1309          * both rt6i_dst and rt6i_src.
1310          * Otherwise, the exception table is indexed by
1311          * a hash of only rt6i_dst.
1312          */
1313         if (ort->rt6i_src.plen)
1314                 src_key = &nrt->rt6i_src.addr;
1315 #endif
1316
1317         /* Update rt6i_prefsrc as it could be changed
1318          * in rt6_remove_prefsrc()
1319          */
1320         nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1321         /* rt6_mtu_change() might lower mtu on ort.
1322          * Only insert this exception route if its mtu
1323          * is less than ort's mtu value.
1324          */
1325         if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1326                 err = -EINVAL;
1327                 goto out;
1328         }
1329
1330         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1331                                                src_key);
1332         if (rt6_ex)
1333                 rt6_remove_exception(bucket, rt6_ex);
1334
1335         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1336         if (!rt6_ex) {
1337                 err = -ENOMEM;
1338                 goto out;
1339         }
1340         rt6_ex->rt6i = nrt;
1341         rt6_ex->stamp = jiffies;
1342         atomic_inc(&nrt->rt6i_ref);
1343         nrt->rt6i_node = ort->rt6i_node;
1344         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1345         bucket->depth++;
1346         net->ipv6.rt6_stats->fib_rt_cache++;
1347
1348         if (bucket->depth > FIB6_MAX_DEPTH)
1349                 rt6_exception_remove_oldest(bucket);
1350
1351 out:
1352         spin_unlock_bh(&rt6_exception_lock);
1353
1354         /* Update fn->fn_sernum to invalidate all cached dst */
1355         if (!err) {
1356                 spin_lock_bh(&ort->rt6i_table->tb6_lock);
1357                 fib6_update_sernum(ort);
1358                 spin_unlock_bh(&ort->rt6i_table->tb6_lock);
1359                 fib6_force_start_gc(net);
1360         }
1361
1362         return err;
1363 }
1364
1365 void rt6_flush_exceptions(struct rt6_info *rt)
1366 {
1367         struct rt6_exception_bucket *bucket;
1368         struct rt6_exception *rt6_ex;
1369         struct hlist_node *tmp;
1370         int i;
1371
1372         spin_lock_bh(&rt6_exception_lock);
1373         /* Prevent rt6_insert_exception() to recreate the bucket list */
1374         rt->exception_bucket_flushed = 1;
1375
1376         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1377                                     lockdep_is_held(&rt6_exception_lock));
1378         if (!bucket)
1379                 goto out;
1380
1381         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1382                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1383                         rt6_remove_exception(bucket, rt6_ex);
1384                 WARN_ON_ONCE(bucket->depth);
1385                 bucket++;
1386         }
1387
1388 out:
1389         spin_unlock_bh(&rt6_exception_lock);
1390 }
1391
1392 /* Find cached rt in the hash table inside passed in rt
1393  * Caller has to hold rcu_read_lock()
1394  */
1395 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1396                                            struct in6_addr *daddr,
1397                                            struct in6_addr *saddr)
1398 {
1399         struct rt6_exception_bucket *bucket;
1400         struct in6_addr *src_key = NULL;
1401         struct rt6_exception *rt6_ex;
1402         struct rt6_info *res = NULL;
1403
1404         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1405
1406 #ifdef CONFIG_IPV6_SUBTREES
1407         /* rt6i_src.plen != 0 indicates rt is in subtree
1408          * and exception table is indexed by a hash of
1409          * both rt6i_dst and rt6i_src.
1410          * Otherwise, the exception table is indexed by
1411          * a hash of only rt6i_dst.
1412          */
1413         if (rt->rt6i_src.plen)
1414                 src_key = saddr;
1415 #endif
1416         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1417
1418         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1419                 res = rt6_ex->rt6i;
1420
1421         return res;
1422 }
1423
1424 /* Remove the passed in cached rt from the hash table that contains it */
1425 int rt6_remove_exception_rt(struct rt6_info *rt)
1426 {
1427         struct rt6_exception_bucket *bucket;
1428         struct rt6_info *from = rt->from;
1429         struct in6_addr *src_key = NULL;
1430         struct rt6_exception *rt6_ex;
1431         int err;
1432
1433         if (!from ||
1434             !(rt->rt6i_flags & RTF_CACHE))
1435                 return -EINVAL;
1436
1437         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1438                 return -ENOENT;
1439
1440         spin_lock_bh(&rt6_exception_lock);
1441         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1442                                     lockdep_is_held(&rt6_exception_lock));
1443 #ifdef CONFIG_IPV6_SUBTREES
1444         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1445          * and exception table is indexed by a hash of
1446          * both rt6i_dst and rt6i_src.
1447          * Otherwise, the exception table is indexed by
1448          * a hash of only rt6i_dst.
1449          */
1450         if (from->rt6i_src.plen)
1451                 src_key = &rt->rt6i_src.addr;
1452 #endif
1453         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1454                                                &rt->rt6i_dst.addr,
1455                                                src_key);
1456         if (rt6_ex) {
1457                 rt6_remove_exception(bucket, rt6_ex);
1458                 err = 0;
1459         } else {
1460                 err = -ENOENT;
1461         }
1462
1463         spin_unlock_bh(&rt6_exception_lock);
1464         return err;
1465 }
1466
1467 /* Find rt6_ex which contains the passed in rt cache and
1468  * refresh its stamp
1469  */
1470 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1471 {
1472         struct rt6_exception_bucket *bucket;
1473         struct rt6_info *from = rt->from;
1474         struct in6_addr *src_key = NULL;
1475         struct rt6_exception *rt6_ex;
1476
1477         if (!from ||
1478             !(rt->rt6i_flags & RTF_CACHE))
1479                 return;
1480
1481         rcu_read_lock();
1482         bucket = rcu_dereference(from->rt6i_exception_bucket);
1483
1484 #ifdef CONFIG_IPV6_SUBTREES
1485         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1486          * and exception table is indexed by a hash of
1487          * both rt6i_dst and rt6i_src.
1488          * Otherwise, the exception table is indexed by
1489          * a hash of only rt6i_dst.
1490          */
1491         if (from->rt6i_src.plen)
1492                 src_key = &rt->rt6i_src.addr;
1493 #endif
1494         rt6_ex = __rt6_find_exception_rcu(&bucket,
1495                                           &rt->rt6i_dst.addr,
1496                                           src_key);
1497         if (rt6_ex)
1498                 rt6_ex->stamp = jiffies;
1499
1500         rcu_read_unlock();
1501 }
1502
1503 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1504 {
1505         struct rt6_exception_bucket *bucket;
1506         struct rt6_exception *rt6_ex;
1507         int i;
1508
1509         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1510                                         lockdep_is_held(&rt6_exception_lock));
1511
1512         if (bucket) {
1513                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1514                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1515                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1516                         }
1517                         bucket++;
1518                 }
1519         }
1520 }
1521
1522 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1523 {
1524         struct rt6_exception_bucket *bucket;
1525         struct rt6_exception *rt6_ex;
1526         int i;
1527
1528         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1529                                         lockdep_is_held(&rt6_exception_lock));
1530
1531         if (bucket) {
1532                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1533                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1534                                 struct rt6_info *entry = rt6_ex->rt6i;
1535                                 /* For RTF_CACHE with rt6i_pmtu == 0
1536                                  * (i.e. a redirected route),
1537                                  * the metrics of its rt->dst.from has already
1538                                  * been updated.
1539                                  */
1540                                 if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1541                                         entry->rt6i_pmtu = mtu;
1542                         }
1543                         bucket++;
1544                 }
1545         }
1546 }
1547
1548 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1549
1550 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1551                                         struct in6_addr *gateway)
1552 {
1553         struct rt6_exception_bucket *bucket;
1554         struct rt6_exception *rt6_ex;
1555         struct hlist_node *tmp;
1556         int i;
1557
1558         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1559                 return;
1560
1561         spin_lock_bh(&rt6_exception_lock);
1562         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1563                                      lockdep_is_held(&rt6_exception_lock));
1564
1565         if (bucket) {
1566                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1567                         hlist_for_each_entry_safe(rt6_ex, tmp,
1568                                                   &bucket->chain, hlist) {
1569                                 struct rt6_info *entry = rt6_ex->rt6i;
1570
1571                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1572                                     RTF_CACHE_GATEWAY &&
1573                                     ipv6_addr_equal(gateway,
1574                                                     &entry->rt6i_gateway)) {
1575                                         rt6_remove_exception(bucket, rt6_ex);
1576                                 }
1577                         }
1578                         bucket++;
1579                 }
1580         }
1581
1582         spin_unlock_bh(&rt6_exception_lock);
1583 }
1584
1585 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1586                                       struct rt6_exception *rt6_ex,
1587                                       struct fib6_gc_args *gc_args,
1588                                       unsigned long now)
1589 {
1590         struct rt6_info *rt = rt6_ex->rt6i;
1591
1592         /* we are pruning and obsoleting aged-out and non gateway exceptions
1593          * even if others have still references to them, so that on next
1594          * dst_check() such references can be dropped.
1595          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1596          * expired, independently from their aging, as per RFC 8201 section 4
1597          */
1598         if (!(rt->rt6i_flags & RTF_EXPIRES) &&
1599             time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1600                 RT6_TRACE("aging clone %p\n", rt);
1601                 rt6_remove_exception(bucket, rt6_ex);
1602                 return;
1603         } else if (rt->rt6i_flags & RTF_GATEWAY) {
1604                 struct neighbour *neigh;
1605                 __u8 neigh_flags = 0;
1606
1607                 neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1608                 if (neigh) {
1609                         neigh_flags = neigh->flags;
1610                         neigh_release(neigh);
1611                 }
1612                 if (!(neigh_flags & NTF_ROUTER)) {
1613                         RT6_TRACE("purging route %p via non-router but gateway\n",
1614                                   rt);
1615                         rt6_remove_exception(bucket, rt6_ex);
1616                         return;
1617                 }
1618         } else if (__rt6_check_expired(rt)) {
1619                 RT6_TRACE("purging expired route %p\n", rt);
1620                 rt6_remove_exception(bucket, rt6_ex);
1621                 return;
1622         }
1623         gc_args->more++;
1624 }
1625
1626 void rt6_age_exceptions(struct rt6_info *rt,
1627                         struct fib6_gc_args *gc_args,
1628                         unsigned long now)
1629 {
1630         struct rt6_exception_bucket *bucket;
1631         struct rt6_exception *rt6_ex;
1632         struct hlist_node *tmp;
1633         int i;
1634
1635         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1636                 return;
1637
1638         spin_lock_bh(&rt6_exception_lock);
1639         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1640                                     lockdep_is_held(&rt6_exception_lock));
1641
1642         if (bucket) {
1643                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1644                         hlist_for_each_entry_safe(rt6_ex, tmp,
1645                                                   &bucket->chain, hlist) {
1646                                 rt6_age_examine_exception(bucket, rt6_ex,
1647                                                           gc_args, now);
1648                         }
1649                         bucket++;
1650                 }
1651         }
1652         spin_unlock_bh(&rt6_exception_lock);
1653 }
1654
1655 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1656                                int oif, struct flowi6 *fl6, int flags)
1657 {
1658         struct fib6_node *fn, *saved_fn;
1659         struct rt6_info *rt, *rt_cache;
1660         int strict = 0;
1661
1662         strict |= flags & RT6_LOOKUP_F_IFACE;
1663         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1664         if (net->ipv6.devconf_all->forwarding == 0)
1665                 strict |= RT6_LOOKUP_F_REACHABLE;
1666
1667         rcu_read_lock();
1668
1669         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1670         saved_fn = fn;
1671
1672         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1673                 oif = 0;
1674
1675 redo_rt6_select:
1676         rt = rt6_select(net, fn, oif, strict);
1677         if (rt->rt6i_nsiblings)
1678                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1679         if (rt == net->ipv6.ip6_null_entry) {
1680                 fn = fib6_backtrack(fn, &fl6->saddr);
1681                 if (fn)
1682                         goto redo_rt6_select;
1683                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1684                         /* also consider unreachable route */
1685                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1686                         fn = saved_fn;
1687                         goto redo_rt6_select;
1688                 }
1689         }
1690
1691         /*Search through exception table */
1692         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1693         if (rt_cache)
1694                 rt = rt_cache;
1695
1696         if (rt == net->ipv6.ip6_null_entry) {
1697                 rcu_read_unlock();
1698                 dst_hold(&rt->dst);
1699                 trace_fib6_table_lookup(net, rt, table, fl6);
1700                 return rt;
1701         } else if (rt->rt6i_flags & RTF_CACHE) {
1702                 if (ip6_hold_safe(net, &rt, true)) {
1703                         dst_use_noref(&rt->dst, jiffies);
1704                         rt6_dst_from_metrics_check(rt);
1705                 }
1706                 rcu_read_unlock();
1707                 trace_fib6_table_lookup(net, rt, table, fl6);
1708                 return rt;
1709         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1710                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1711                 /* Create a RTF_CACHE clone which will not be
1712                  * owned by the fib6 tree.  It is for the special case where
1713                  * the daddr in the skb during the neighbor look-up is different
1714                  * from the fl6->daddr used to look-up route here.
1715                  */
1716
1717                 struct rt6_info *uncached_rt;
1718
1719                 if (ip6_hold_safe(net, &rt, true)) {
1720                         dst_use_noref(&rt->dst, jiffies);
1721                 } else {
1722                         rcu_read_unlock();
1723                         uncached_rt = rt;
1724                         goto uncached_rt_out;
1725                 }
1726                 rcu_read_unlock();
1727
1728                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1729                 dst_release(&rt->dst);
1730
1731                 if (uncached_rt) {
1732                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1733                          * No need for another dst_hold()
1734                          */
1735                         rt6_uncached_list_add(uncached_rt);
1736                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1737                 } else {
1738                         uncached_rt = net->ipv6.ip6_null_entry;
1739                         dst_hold(&uncached_rt->dst);
1740                 }
1741
1742 uncached_rt_out:
1743                 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1744                 return uncached_rt;
1745
1746         } else {
1747                 /* Get a percpu copy */
1748
1749                 struct rt6_info *pcpu_rt;
1750
1751                 dst_use_noref(&rt->dst, jiffies);
1752                 local_bh_disable();
1753                 pcpu_rt = rt6_get_pcpu_route(rt);
1754
1755                 if (!pcpu_rt) {
1756                         /* atomic_inc_not_zero() is needed when using rcu */
1757                         if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1758                                 /* No dst_hold() on rt is needed because grabbing
1759                                  * rt->rt6i_ref makes sure rt can't be released.
1760                                  */
1761                                 pcpu_rt = rt6_make_pcpu_route(rt);
1762                                 rt6_release(rt);
1763                         } else {
1764                                 /* rt is already removed from tree */
1765                                 pcpu_rt = net->ipv6.ip6_null_entry;
1766                                 dst_hold(&pcpu_rt->dst);
1767                         }
1768                 }
1769                 local_bh_enable();
1770                 rcu_read_unlock();
1771                 trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1772                 return pcpu_rt;
1773         }
1774 }
1775 EXPORT_SYMBOL_GPL(ip6_pol_route);
1776
1777 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1778                                             struct flowi6 *fl6, int flags)
1779 {
1780         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1781 }
1782
1783 struct dst_entry *ip6_route_input_lookup(struct net *net,
1784                                          struct net_device *dev,
1785                                          struct flowi6 *fl6, int flags)
1786 {
1787         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1788                 flags |= RT6_LOOKUP_F_IFACE;
1789
1790         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1791 }
1792 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1793
1794 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1795                                   struct flow_keys *keys)
1796 {
1797         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1798         const struct ipv6hdr *key_iph = outer_iph;
1799         const struct ipv6hdr *inner_iph;
1800         const struct icmp6hdr *icmph;
1801         struct ipv6hdr _inner_iph;
1802
1803         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1804                 goto out;
1805
1806         icmph = icmp6_hdr(skb);
1807         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1808             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1809             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1810             icmph->icmp6_type != ICMPV6_PARAMPROB)
1811                 goto out;
1812
1813         inner_iph = skb_header_pointer(skb,
1814                                        skb_transport_offset(skb) + sizeof(*icmph),
1815                                        sizeof(_inner_iph), &_inner_iph);
1816         if (!inner_iph)
1817                 goto out;
1818
1819         key_iph = inner_iph;
1820 out:
1821         memset(keys, 0, sizeof(*keys));
1822         keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1823         keys->addrs.v6addrs.src = key_iph->saddr;
1824         keys->addrs.v6addrs.dst = key_iph->daddr;
1825         keys->tags.flow_label = ip6_flowinfo(key_iph);
1826         keys->basic.ip_proto = key_iph->nexthdr;
1827 }
1828
1829 /* if skb is set it will be used and fl6 can be NULL */
1830 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1831 {
1832         struct flow_keys hash_keys;
1833
1834         if (skb) {
1835                 ip6_multipath_l3_keys(skb, &hash_keys);
1836                 return flow_hash_from_keys(&hash_keys);
1837         }
1838
1839         return get_hash_from_flowi6(fl6);
1840 }
1841
1842 void ip6_route_input(struct sk_buff *skb)
1843 {
1844         const struct ipv6hdr *iph = ipv6_hdr(skb);
1845         struct net *net = dev_net(skb->dev);
1846         int flags = RT6_LOOKUP_F_HAS_SADDR;
1847         struct ip_tunnel_info *tun_info;
1848         struct flowi6 fl6 = {
1849                 .flowi6_iif = skb->dev->ifindex,
1850                 .daddr = iph->daddr,
1851                 .saddr = iph->saddr,
1852                 .flowlabel = ip6_flowinfo(iph),
1853                 .flowi6_mark = skb->mark,
1854                 .flowi6_proto = iph->nexthdr,
1855         };
1856
1857         tun_info = skb_tunnel_info(skb);
1858         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1859                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1860         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1861                 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1862         skb_dst_drop(skb);
1863         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1864 }
1865
1866 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1867                                              struct flowi6 *fl6, int flags)
1868 {
1869         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1870 }
1871
1872 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1873                                          struct flowi6 *fl6, int flags)
1874 {
1875         bool any_src;
1876
1877         if (rt6_need_strict(&fl6->daddr)) {
1878                 struct dst_entry *dst;
1879
1880                 dst = l3mdev_link_scope_lookup(net, fl6);
1881                 if (dst)
1882                         return dst;
1883         }
1884
1885         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1886
1887         any_src = ipv6_addr_any(&fl6->saddr);
1888         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1889             (fl6->flowi6_oif && any_src))
1890                 flags |= RT6_LOOKUP_F_IFACE;
1891
1892         if (!any_src)
1893                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1894         else if (sk)
1895                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1896
1897         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1898 }
1899 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1900
1901 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1902 {
1903         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1904         struct net_device *loopback_dev = net->loopback_dev;
1905         struct dst_entry *new = NULL;
1906
1907         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1908                        DST_OBSOLETE_DEAD, 0);
1909         if (rt) {
1910                 rt6_info_init(rt);
1911                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1912
1913                 new = &rt->dst;
1914                 new->__use = 1;
1915                 new->input = dst_discard;
1916                 new->output = dst_discard_out;
1917
1918                 dst_copy_metrics(new, &ort->dst);
1919
1920                 rt->rt6i_idev = in6_dev_get(loopback_dev);
1921                 rt->rt6i_gateway = ort->rt6i_gateway;
1922                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1923                 rt->rt6i_metric = 0;
1924
1925                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1926 #ifdef CONFIG_IPV6_SUBTREES
1927                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1928 #endif
1929         }
1930
1931         dst_release(dst_orig);
1932         return new ? new : ERR_PTR(-ENOMEM);
1933 }
1934
1935 /*
1936  *      Destination cache support functions
1937  */
1938
1939 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1940 {
1941         if (rt->from &&
1942             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
1943                 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
1944 }
1945
1946 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1947 {
1948         u32 rt_cookie = 0;
1949
1950         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1951                 return NULL;
1952
1953         if (rt6_check_expired(rt))
1954                 return NULL;
1955
1956         return &rt->dst;
1957 }
1958
1959 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1960 {
1961         if (!__rt6_check_expired(rt) &&
1962             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1963             rt6_check(rt->from, cookie))
1964                 return &rt->dst;
1965         else
1966                 return NULL;
1967 }
1968
1969 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1970 {
1971         struct rt6_info *rt;
1972
1973         rt = (struct rt6_info *) dst;
1974
1975         /* All IPV6 dsts are created with ->obsolete set to the value
1976          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1977          * into this function always.
1978          */
1979
1980         rt6_dst_from_metrics_check(rt);
1981
1982         if (rt->rt6i_flags & RTF_PCPU ||
1983             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
1984                 return rt6_dst_from_check(rt, cookie);
1985         else
1986                 return rt6_check(rt, cookie);
1987 }
1988
1989 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1990 {
1991         struct rt6_info *rt = (struct rt6_info *) dst;
1992
1993         if (rt) {
1994                 if (rt->rt6i_flags & RTF_CACHE) {
1995                         if (rt6_check_expired(rt)) {
1996                                 ip6_del_rt(rt);
1997                                 dst = NULL;
1998                         }
1999                 } else {
2000                         dst_release(dst);
2001                         dst = NULL;
2002                 }
2003         }
2004         return dst;
2005 }
2006
2007 static void ip6_link_failure(struct sk_buff *skb)
2008 {
2009         struct rt6_info *rt;
2010
2011         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2012
2013         rt = (struct rt6_info *) skb_dst(skb);
2014         if (rt) {
2015                 if (rt->rt6i_flags & RTF_CACHE) {
2016                         if (dst_hold_safe(&rt->dst))
2017                                 ip6_del_rt(rt);
2018                 } else {
2019                         struct fib6_node *fn;
2020
2021                         rcu_read_lock();
2022                         fn = rcu_dereference(rt->rt6i_node);
2023                         if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2024                                 fn->fn_sernum = -1;
2025                         rcu_read_unlock();
2026                 }
2027         }
2028 }
2029
2030 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2031 {
2032         struct net *net = dev_net(rt->dst.dev);
2033
2034         rt->rt6i_flags |= RTF_MODIFIED;
2035         rt->rt6i_pmtu = mtu;
2036         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2037 }
2038
2039 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2040 {
2041         return !(rt->rt6i_flags & RTF_CACHE) &&
2042                 (rt->rt6i_flags & RTF_PCPU ||
2043                  rcu_access_pointer(rt->rt6i_node));
2044 }
2045
2046 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2047                                  const struct ipv6hdr *iph, u32 mtu)
2048 {
2049         const struct in6_addr *daddr, *saddr;
2050         struct rt6_info *rt6 = (struct rt6_info *)dst;
2051
2052         if (rt6->rt6i_flags & RTF_LOCAL)
2053                 return;
2054
2055         if (dst_metric_locked(dst, RTAX_MTU))
2056                 return;
2057
2058         if (iph) {
2059                 daddr = &iph->daddr;
2060                 saddr = &iph->saddr;
2061         } else if (sk) {
2062                 daddr = &sk->sk_v6_daddr;
2063                 saddr = &inet6_sk(sk)->saddr;
2064         } else {
2065                 daddr = NULL;
2066                 saddr = NULL;
2067         }
2068         dst_confirm_neigh(dst, daddr);
2069         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2070         if (mtu >= dst_mtu(dst))
2071                 return;
2072
2073         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2074                 rt6_do_update_pmtu(rt6, mtu);
2075                 /* update rt6_ex->stamp for cache */
2076                 if (rt6->rt6i_flags & RTF_CACHE)
2077                         rt6_update_exception_stamp_rt(rt6);
2078         } else if (daddr) {
2079                 struct rt6_info *nrt6;
2080
2081                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2082                 if (nrt6) {
2083                         rt6_do_update_pmtu(nrt6, mtu);
2084                         if (rt6_insert_exception(nrt6, rt6))
2085                                 dst_release_immediate(&nrt6->dst);
2086                 }
2087         }
2088 }
2089
2090 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2091                                struct sk_buff *skb, u32 mtu)
2092 {
2093         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2094 }
2095
2096 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2097                      int oif, u32 mark, kuid_t uid)
2098 {
2099         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2100         struct dst_entry *dst;
2101         struct flowi6 fl6;
2102
2103         memset(&fl6, 0, sizeof(fl6));
2104         fl6.flowi6_oif = oif;
2105         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2106         fl6.daddr = iph->daddr;
2107         fl6.saddr = iph->saddr;
2108         fl6.flowlabel = ip6_flowinfo(iph);
2109         fl6.flowi6_uid = uid;
2110
2111         dst = ip6_route_output(net, NULL, &fl6);
2112         if (!dst->error)
2113                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2114         dst_release(dst);
2115 }
2116 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2117
2118 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2119 {
2120         struct dst_entry *dst;
2121
2122         ip6_update_pmtu(skb, sock_net(sk), mtu,
2123                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2124
2125         dst = __sk_dst_get(sk);
2126         if (!dst || !dst->obsolete ||
2127             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2128                 return;
2129
2130         bh_lock_sock(sk);
2131         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2132                 ip6_datagram_dst_update(sk, false);
2133         bh_unlock_sock(sk);
2134 }
2135 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2136
/* Handle redirects */

/* Lookup key used while processing a redirect: a flowi6 extended with
 * the source address of the redirect packet (stored in @gateway by
 * ip6_route_redirect()).  @fl6 has to remain the first member, because
 * __ip6_route_redirect() casts the flowi6 pointer it receives back to
 * a struct ip6rd_flowi pointer.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2142
2143 static struct rt6_info *__ip6_route_redirect(struct net *net,
2144                                              struct fib6_table *table,
2145                                              struct flowi6 *fl6,
2146                                              int flags)
2147 {
2148         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2149         struct rt6_info *rt, *rt_cache;
2150         struct fib6_node *fn;
2151
2152         /* Get the "current" route for this destination and
2153          * check if the redirect has come from appropriate router.
2154          *
2155          * RFC 4861 specifies that redirects should only be
2156          * accepted if they come from the nexthop to the target.
2157          * Due to the way the routes are chosen, this notion
2158          * is a bit fuzzy and one might need to check all possible
2159          * routes.
2160          */
2161
2162         rcu_read_lock();
2163         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2164 restart:
2165         for_each_fib6_node_rt_rcu(fn) {
2166                 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2167                         continue;
2168                 if (rt6_check_expired(rt))
2169                         continue;
2170                 if (rt->dst.error)
2171                         break;
2172                 if (!(rt->rt6i_flags & RTF_GATEWAY))
2173                         continue;
2174                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2175                         continue;
2176                 /* rt_cache's gateway might be different from its 'parent'
2177                  * in the case of an ip redirect.
2178                  * So we keep searching in the exception table if the gateway
2179                  * is different.
2180                  */
2181                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2182                         rt_cache = rt6_find_cached_rt(rt,
2183                                                       &fl6->daddr,
2184                                                       &fl6->saddr);
2185                         if (rt_cache &&
2186                             ipv6_addr_equal(&rdfl->gateway,
2187                                             &rt_cache->rt6i_gateway)) {
2188                                 rt = rt_cache;
2189                                 break;
2190                         }
2191                         continue;
2192                 }
2193                 break;
2194         }
2195
2196         if (!rt)
2197                 rt = net->ipv6.ip6_null_entry;
2198         else if (rt->dst.error) {
2199                 rt = net->ipv6.ip6_null_entry;
2200                 goto out;
2201         }
2202
2203         if (rt == net->ipv6.ip6_null_entry) {
2204                 fn = fib6_backtrack(fn, &fl6->saddr);
2205                 if (fn)
2206                         goto restart;
2207         }
2208
2209 out:
2210         ip6_hold_safe(net, &rt, true);
2211
2212         rcu_read_unlock();
2213
2214         trace_fib6_table_lookup(net, rt, table, fl6);
2215         return rt;
2216 };
2217
2218 static struct dst_entry *ip6_route_redirect(struct net *net,
2219                                         const struct flowi6 *fl6,
2220                                         const struct in6_addr *gateway)
2221 {
2222         int flags = RT6_LOOKUP_F_HAS_SADDR;
2223         struct ip6rd_flowi rdfl;
2224
2225         rdfl.fl6 = *fl6;
2226         rdfl.gateway = *gateway;
2227
2228         return fib6_rule_lookup(net, &rdfl.fl6,
2229                                 flags, __ip6_route_redirect);
2230 }
2231
2232 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2233                   kuid_t uid)
2234 {
2235         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2236         struct dst_entry *dst;
2237         struct flowi6 fl6;
2238
2239         memset(&fl6, 0, sizeof(fl6));
2240         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2241         fl6.flowi6_oif = oif;
2242         fl6.flowi6_mark = mark;
2243         fl6.daddr = iph->daddr;
2244         fl6.saddr = iph->saddr;
2245         fl6.flowlabel = ip6_flowinfo(iph);
2246         fl6.flowi6_uid = uid;
2247
2248         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2249         rt6_do_redirect(dst, NULL, skb);
2250         dst_release(dst);
2251 }
2252 EXPORT_SYMBOL_GPL(ip6_redirect);
2253
2254 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2255                             u32 mark)
2256 {
2257         const struct ipv6hdr *iph = ipv6_hdr(skb);
2258         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2259         struct dst_entry *dst;
2260         struct flowi6 fl6;
2261
2262         memset(&fl6, 0, sizeof(fl6));
2263         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2264         fl6.flowi6_oif = oif;
2265         fl6.flowi6_mark = mark;
2266         fl6.daddr = msg->dest;
2267         fl6.saddr = iph->daddr;
2268         fl6.flowi6_uid = sock_net_uid(net, NULL);
2269
2270         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2271         rt6_do_redirect(dst, NULL, skb);
2272         dst_release(dst);
2273 }
2274
2275 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2276 {
2277         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2278                      sk->sk_uid);
2279 }
2280 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2281
2282 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2283 {
2284         struct net_device *dev = dst->dev;
2285         unsigned int mtu = dst_mtu(dst);
2286         struct net *net = dev_net(dev);
2287
2288         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2289
2290         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2291                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2292
2293         /*
2294          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2295          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2296          * IPV6_MAXPLEN is also valid and means: "any MSS,
2297          * rely only on pmtu discovery"
2298          */
2299         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2300                 mtu = IPV6_MAXPLEN;
2301         return mtu;
2302 }
2303
2304 static unsigned int ip6_mtu(const struct dst_entry *dst)
2305 {
2306         const struct rt6_info *rt = (const struct rt6_info *)dst;
2307         unsigned int mtu = rt->rt6i_pmtu;
2308         struct inet6_dev *idev;
2309
2310         if (mtu)
2311                 goto out;
2312
2313         mtu = dst_metric_raw(dst, RTAX_MTU);
2314         if (mtu)
2315                 goto out;
2316
2317         mtu = IPV6_MIN_MTU;
2318
2319         rcu_read_lock();
2320         idev = __in6_dev_get(dst->dev);
2321         if (idev)
2322                 mtu = idev->cnf.mtu6;
2323         rcu_read_unlock();
2324
2325 out:
2326         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2327
2328         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2329 }
2330
2331 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2332                                   struct flowi6 *fl6)
2333 {
2334         struct dst_entry *dst;
2335         struct rt6_info *rt;
2336         struct inet6_dev *idev = in6_dev_get(dev);
2337         struct net *net = dev_net(dev);
2338
2339         if (unlikely(!idev))
2340                 return ERR_PTR(-ENODEV);
2341
2342         rt = ip6_dst_alloc(net, dev, 0);
2343         if (unlikely(!rt)) {
2344                 in6_dev_put(idev);
2345                 dst = ERR_PTR(-ENOMEM);
2346                 goto out;
2347         }
2348
2349         rt->dst.flags |= DST_HOST;
2350         rt->dst.input = ip6_input;
2351         rt->dst.output  = ip6_output;
2352         rt->rt6i_gateway  = fl6->daddr;
2353         rt->rt6i_dst.addr = fl6->daddr;
2354         rt->rt6i_dst.plen = 128;
2355         rt->rt6i_idev     = idev;
2356         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2357
2358         /* Add this dst into uncached_list so that rt6_disable_ip() can
2359          * do proper release of the net_device
2360          */
2361         rt6_uncached_list_add(rt);
2362         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2363
2364         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2365
2366 out:
2367         return dst;
2368 }
2369
2370 static int ip6_dst_gc(struct dst_ops *ops)
2371 {
2372         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2373         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2374         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2375         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2376         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2377         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2378         int entries;
2379
2380         entries = dst_entries_get_fast(ops);
2381         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2382             entries <= rt_max_size)
2383                 goto out;
2384
2385         net->ipv6.ip6_rt_gc_expire++;
2386         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2387         entries = dst_entries_get_slow(ops);
2388         if (entries < ops->gc_thresh)
2389                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2390 out:
2391         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2392         return entries > rt_max_size;
2393 }
2394
2395 static int ip6_convert_metrics(struct mx6_config *mxc,
2396                                const struct fib6_config *cfg)
2397 {
2398         struct net *net = cfg->fc_nlinfo.nl_net;
2399         bool ecn_ca = false;
2400         struct nlattr *nla;
2401         int remaining;
2402         u32 *mp;
2403
2404         if (!cfg->fc_mx)
2405                 return 0;
2406
2407         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2408         if (unlikely(!mp))
2409                 return -ENOMEM;
2410
2411         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2412                 int type = nla_type(nla);
2413                 u32 val;
2414
2415                 if (!type)
2416                         continue;
2417                 if (unlikely(type > RTAX_MAX))
2418                         goto err;
2419
2420                 if (type == RTAX_CC_ALGO) {
2421                         char tmp[TCP_CA_NAME_MAX];
2422
2423                         nla_strlcpy(tmp, nla, sizeof(tmp));
2424                         val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2425                         if (val == TCP_CA_UNSPEC)
2426                                 goto err;
2427                 } else {
2428                         val = nla_get_u32(nla);
2429                 }
2430                 if (type == RTAX_HOPLIMIT && val > 255)
2431                         val = 255;
2432                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2433                         goto err;
2434
2435                 mp[type - 1] = val;
2436                 __set_bit(type - 1, mxc->mx_valid);
2437         }
2438
2439         if (ecn_ca) {
2440                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2441                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2442         }
2443
2444         mxc->mx = mp;
2445         return 0;
2446  err:
2447         kfree(mp);
2448         return -EINVAL;
2449 }
2450
2451 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2452                                             struct fib6_config *cfg,
2453                                             const struct in6_addr *gw_addr)
2454 {
2455         struct flowi6 fl6 = {
2456                 .flowi6_oif = cfg->fc_ifindex,
2457                 .daddr = *gw_addr,
2458                 .saddr = cfg->fc_prefsrc,
2459         };
2460         struct fib6_table *table;
2461         struct rt6_info *rt;
2462         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2463
2464         table = fib6_get_table(net, cfg->fc_table);
2465         if (!table)
2466                 return NULL;
2467
2468         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2469                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2470
2471         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2472
2473         /* if table lookup failed, fall back to full lookup */
2474         if (rt == net->ipv6.ip6_null_entry) {
2475                 ip6_rt_put(rt);
2476                 rt = NULL;
2477         }
2478
2479         return rt;
2480 }
2481
/*
 * ip6_route_info_create - build (but do not insert) a route from a fib6_config
 * @cfg:    route description. fc_metric and fc_protocol are rewritten here
 *          when left at 0 / RTPROT_UNSPEC, and fc_nlinfo.nl_net is rewritten
 *          on success.
 * @extack: netlink extended ack that receives validation error messages.
 *
 * Validates the request, resolves the FIB table and the egress device,
 * allocates an rt6_info and fills it in.  On success the references held on
 * dev and idev are stored in the returned route.  On failure every reference
 * taken so far is dropped and an ERR_PTR() carrying a negative errno is
 * returned.
 */
2482 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2483                                               struct netlink_ext_ack *extack)
2484 {
2485         struct net *net = cfg->fc_nlinfo.nl_net;
2486         struct rt6_info *rt = NULL;
2487         struct net_device *dev = NULL;
2488         struct inet6_dev *idev = NULL;
2489         struct fib6_table *table;
2490         int addr_type;
2491         int err = -EINVAL;
2492
2493         /* RTF_PCPU is an internal flag; can not be set by userspace */
2494         if (cfg->fc_flags & RTF_PCPU) {
2495                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2496                 goto out;
2497         }
2498
2499         /* RTF_CACHE is an internal flag; can not be set by userspace */
2500         if (cfg->fc_flags & RTF_CACHE) {
2501                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2502                 goto out;
2503         }
2504
2505         if (cfg->fc_dst_len > 128) {
2506                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2507                 goto out;
2508         }
2509         if (cfg->fc_src_len > 128) {
2510                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2511                 goto out;
2512         }
2513 #ifndef CONFIG_IPV6_SUBTREES
2514         if (cfg->fc_src_len) {
2515                 NL_SET_ERR_MSG(extack,
2516                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2517                 goto out;
2518         }
2519 #endif
             /* An explicit ifindex pins the egress device; take dev and idev
              * references that are either handed to the route or dropped at out.
              */
2520         if (cfg->fc_ifindex) {
2521                 err = -ENODEV;
2522                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2523                 if (!dev)
2524                         goto out;
2525                 idev = in6_dev_get(dev);
2526                 if (!idev)
2527                         goto out;
2528         }
2529
2530         if (cfg->fc_metric == 0)
2531                 cfg->fc_metric = IP6_RT_PRIO_USER;
2532
             /* Without NLM_F_CREATE only an existing table is looked up first;
              * if there is none, a warning is logged and the table is created
              * anyway.
              */
2533         err = -ENOBUFS;
2534         if (cfg->fc_nlinfo.nlh &&
2535             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2536                 table = fib6_get_table(net, cfg->fc_table);
2537                 if (!table) {
2538                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2539                         table = fib6_new_table(net, cfg->fc_table);
2540                 }
2541         } else {
2542                 table = fib6_new_table(net, cfg->fc_table);
2543         }
2544
2545         if (!table)
2546                 goto out;
2547
             /* Every route except RTF_ADDRCONF ones is allocated with DST_NOCOUNT. */
2548         rt = ip6_dst_alloc(net, NULL,
2549                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2550
2551         if (!rt) {
2552                 err = -ENOMEM;
2553                 goto out;
2554         }
2555
2556         if (cfg->fc_flags & RTF_EXPIRES)
2557                 rt6_set_expires(rt, jiffies +
2558                                 clock_t_to_jiffies(cfg->fc_expires));
2559         else
2560                 rt6_clean_expires(rt);
2561
2562         if (cfg->fc_protocol == RTPROT_UNSPEC)
2563                 cfg->fc_protocol = RTPROT_BOOT;
2564         rt->rt6i_protocol = cfg->fc_protocol;
2565
2566         addr_type = ipv6_addr_type(&cfg->fc_dst);
2567
             /* Pick the input handler from the destination type and RTF_LOCAL. */
2568         if (addr_type & IPV6_ADDR_MULTICAST)
2569                 rt->dst.input = ip6_mc_input;
2570         else if (cfg->fc_flags & RTF_LOCAL)
2571                 rt->dst.input = ip6_input;
2572         else
2573                 rt->dst.input = ip6_forward;
2574
2575         rt->dst.output = ip6_output;
2576
2577         if (cfg->fc_encap) {
2578                 struct lwtunnel_state *lwtstate;
2579
2580                 err = lwtunnel_build_state(cfg->fc_encap_type,
2581                                            cfg->fc_encap, AF_INET6, cfg,
2582                                            &lwtstate, extack);
2583                 if (err)
2584                         goto out;
2585                 rt->dst.lwtstate = lwtstate_get(lwtstate);
                     /* Interpose the lwtunnel handlers where requested, saving
                      * the handlers chosen above in the lwtstate.
                      */
2586                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2587                         rt->dst.lwtstate->orig_output = rt->dst.output;
2588                         rt->dst.output = lwtunnel_output;
2589                 }
2590                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2591                         rt->dst.lwtstate->orig_input = rt->dst.input;
2592                         rt->dst.input = lwtunnel_input;
2593                 }
2594         }
2595
2596         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2597         rt->rt6i_dst.plen = cfg->fc_dst_len;
2598         if (rt->rt6i_dst.plen == 128)
2599                 rt->dst.flags |= DST_HOST;
2600
2601 #ifdef CONFIG_IPV6_SUBTREES
2602         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2603         rt->rt6i_src.plen = cfg->fc_src_len;
2604 #endif
2605
2606         rt->rt6i_metric = cfg->fc_metric;
2607
2608         /* We cannot add true routes via loopback here,
2609            they would result in kernel looping; promote them to reject routes
2610          */
2611         if ((cfg->fc_flags & RTF_REJECT) ||
2612             (dev && (dev->flags & IFF_LOOPBACK) &&
2613              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2614              !(cfg->fc_flags & RTF_LOCAL))) {
2615                 /* hold loopback dev/idev if we haven't done so. */
2616                 if (dev != net->loopback_dev) {
2617                         if (dev) {
2618                                 dev_put(dev);
2619                                 in6_dev_put(idev);
2620                         }
2621                         dev = net->loopback_dev;
2622                         dev_hold(dev);
2623                         idev = in6_dev_get(dev);
2624                         if (!idev) {
2625                                 err = -ENODEV;
2626                                 goto out;
2627                         }
2628                 }
2629                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
                     /* fc_type selects the dst error and the drop handlers; any
                      * type not listed explicitly ends up with -ENETUNREACH.
                      */
2630                 switch (cfg->fc_type) {
2631                 case RTN_BLACKHOLE:
2632                         rt->dst.error = -EINVAL;
2633                         rt->dst.output = dst_discard_out;
2634                         rt->dst.input = dst_discard;
2635                         break;
2636                 case RTN_PROHIBIT:
2637                         rt->dst.error = -EACCES;
2638                         rt->dst.output = ip6_pkt_prohibit_out;
2639                         rt->dst.input = ip6_pkt_prohibit;
2640                         break;
2641                 case RTN_THROW:
2642                 case RTN_UNREACHABLE:
2643                 default:
2644                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2645                                         : (cfg->fc_type == RTN_UNREACHABLE)
2646                                         ? -EHOSTUNREACH : -ENETUNREACH;
2647                         rt->dst.output = ip6_pkt_discard_out;
2648                         rt->dst.input = ip6_pkt_discard;
2649                         break;
2650                 }
2651                 goto install_route;
2652         }
2653
2654         if (cfg->fc_flags & RTF_GATEWAY) {
2655                 const struct in6_addr *gw_addr;
2656                 int gwa_type;
2657
2658                 gw_addr = &cfg->fc_gateway;
2659                 gwa_type = ipv6_addr_type(gw_addr);
2660
2661                 /* if gw_addr is local we will fail to detect this in case
2662                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
2663                  * will return already-added prefix route via interface that
2664                  * prefix route was assigned to, which might be non-loopback.
2665                  */
2666                 err = -EINVAL;
2667                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2668                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2669                                             dev : NULL, 0, 0)) {
2670                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2671                         goto out;
2672                 }
2673                 rt->rt6i_gateway = *gw_addr;
2674
2675                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2676                         struct rt6_info *grt = NULL;
2677
2678                         /* IPv6 strictly inhibits using not link-local
2679                            addresses as nexthop address.
2680                            Otherwise, router will not be able to send redirects.
2681                            It is very good, but in some (rare!) circumstances
2682                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2683                            some exceptions. --ANK
2684                            We allow IPv4-mapped nexthops to support RFC4798-type
2685                            addressing
2686                          */
2687                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2688                                           IPV6_ADDR_MAPPED))) {
2689                                 NL_SET_ERR_MSG(extack,
2690                                                "Invalid gateway address");
2691                                 goto out;
2692                         }
2693
                             /* Try the requested table first; discard a hit that
                              * is itself a gateway route or that uses a different
                              * device than the one requested.
                              */
2694                         if (cfg->fc_table) {
2695                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2696
2697                                 if (grt) {
2698                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2699                                             (dev && dev != grt->dst.dev)) {
2700                                                 ip6_rt_put(grt);
2701                                                 grt = NULL;
2702                                         }
2703                                 }
2704                         }
2705
2706                         if (!grt)
2707                                 grt = rt6_lookup(net, gw_addr, NULL,
2708                                                  cfg->fc_ifindex, 1);
2709
2710                         err = -EHOSTUNREACH;
2711                         if (!grt)
2712                                 goto out;
2713                         if (dev) {
2714                                 if (dev != grt->dst.dev) {
2715                                         ip6_rt_put(grt);
2716                                         goto out;
2717                                 }
2718                         } else {
                                     /* No device was requested: adopt the one the
                                      * gateway route uses, with our own references.
                                      */
2719                                 dev = grt->dst.dev;
2720                                 idev = grt->rt6i_idev;
2721                                 dev_hold(dev);
2722                                 in6_dev_hold(grt->rt6i_idev);
2723                         }
                             /* err stays -EHOSTUNREACH if the gateway is only
                              * reachable through another gateway.
                              */
2724                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2725                                 err = 0;
2726                         ip6_rt_put(grt);
2727
2728                         if (err)
2729                                 goto out;
2730                 }
2731                 err = -EINVAL;
2732                 if (!dev) {
2733                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2734                         goto out;
2735                 } else if (dev->flags & IFF_LOOPBACK) {
2736                         NL_SET_ERR_MSG(extack,
2737                                        "Egress device can not be loopback device for this route");
2738                         goto out;
2739                 }
2740         }
2741
2742         err = -ENODEV;
2743         if (!dev)
2744                 goto out;
2745
2746         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2747                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2748                         NL_SET_ERR_MSG(extack, "Invalid source address");
2749                         err = -EINVAL;
2750                         goto out;
2751                 }
2752                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2753                 rt->rt6i_prefsrc.plen = 128;
2754         } else
2755                 rt->rt6i_prefsrc.plen = 0;
2756
2757         rt->rt6i_flags = cfg->fc_flags;
2758
2759 install_route:
2760         if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2761             !netif_carrier_ok(dev))
2762                 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
             /* The dev and idev references held above now belong to the route. */
2763         rt->dst.dev = dev;
2764         rt->rt6i_idev = idev;
2765         rt->rt6i_table = table;
2766
2767         cfg->fc_nlinfo.nl_net = dev_net(dev);
2768
2769         return rt;
2770 out:
             /* rt->dst.dev and rt->rt6i_idev are only assigned after
              * install_route, so on this path dev and idev are released here
              * separately from the route itself.
              */
2771         if (dev)
2772                 dev_put(dev);
2773         if (idev)
2774                 in6_dev_put(idev);
2775         if (rt)
2776                 dst_release_immediate(&rt->dst);
2777
2778         return ERR_PTR(err);
2779 }
2780
2781 int ip6_route_add(struct fib6_config *cfg,
2782                   struct netlink_ext_ack *extack)
2783 {
2784         struct mx6_config mxc = { .mx = NULL, };
2785         struct rt6_info *rt;
2786         int err;
2787
2788         rt = ip6_route_info_create(cfg, extack);
2789         if (IS_ERR(rt)) {
2790                 err = PTR_ERR(rt);
2791                 rt = NULL;
2792                 goto out;
2793         }
2794
2795         err = ip6_convert_metrics(&mxc, cfg);
2796         if (err)
2797                 goto out;
2798
2799         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2800
2801         kfree(mxc.mx);
2802
2803         return err;
2804 out:
2805         if (rt)
2806                 dst_release_immediate(&rt->dst);
2807
2808         return err;
2809 }
2810
2811 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2812 {
2813         int err;
2814         struct fib6_table *table;
2815         struct net *net = dev_net(rt->dst.dev);
2816
2817         if (rt == net->ipv6.ip6_null_entry) {
2818                 err = -ENOENT;
2819                 goto out;
2820         }
2821
2822         table = rt->rt6i_table;
2823         spin_lock_bh(&table->tb6_lock);
2824         err = fib6_del(rt, info);
2825         spin_unlock_bh(&table->tb6_lock);
2826
2827 out:
2828         ip6_rt_put(rt);
2829         return err;
2830 }
2831
2832 int ip6_del_rt(struct rt6_info *rt)
2833 {
2834         struct nl_info info = {
2835                 .nl_net = dev_net(rt->dst.dev),
2836         };
2837         return __ip6_del_rt(rt, &info);
2838 }
2839
2840 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2841 {
2842         struct nl_info *info = &cfg->fc_nlinfo;
2843         struct net *net = info->nl_net;
2844         struct sk_buff *skb = NULL;
2845         struct fib6_table *table;
2846         int err = -ENOENT;
2847
2848         if (rt == net->ipv6.ip6_null_entry)
2849                 goto out_put;
2850         table = rt->rt6i_table;
2851         spin_lock_bh(&table->tb6_lock);
2852
2853         if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2854                 struct rt6_info *sibling, *next_sibling;
2855
2856                 /* prefer to send a single notification with all hops */
2857                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2858                 if (skb) {
2859                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2860
2861                         if (rt6_fill_node(net, skb, rt,
2862                                           NULL, NULL, 0, RTM_DELROUTE,
2863                                           info->portid, seq, 0) < 0) {
2864                                 kfree_skb(skb);
2865                                 skb = NULL;
2866                         } else
2867                                 info->skip_notify = 1;
2868                 }
2869
2870                 list_for_each_entry_safe(sibling, next_sibling,
2871                                          &rt->rt6i_siblings,
2872                                          rt6i_siblings) {
2873                         err = fib6_del(sibling, info);
2874                         if (err)
2875                                 goto out_unlock;
2876                 }
2877         }
2878
2879         err = fib6_del(rt, info);
2880 out_unlock:
2881         spin_unlock_bh(&table->tb6_lock);
2882 out_put:
2883         ip6_rt_put(rt);
2884
2885         if (skb) {
2886                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2887                             info->nlh, gfp_any());
2888         }
2889         return err;
2890 }
2891
2892 static int ip6_route_del(struct fib6_config *cfg,
2893                          struct netlink_ext_ack *extack)
2894 {
2895         struct rt6_info *rt, *rt_cache;
2896         struct fib6_table *table;
2897         struct fib6_node *fn;
2898         int err = -ESRCH;
2899
2900         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2901         if (!table) {
2902                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2903                 return err;
2904         }
2905
2906         rcu_read_lock();
2907
2908         fn = fib6_locate(&table->tb6_root,
2909                          &cfg->fc_dst, cfg->fc_dst_len,
2910                          &cfg->fc_src, cfg->fc_src_len,
2911                          !(cfg->fc_flags & RTF_CACHE));
2912
2913         if (fn) {
2914                 for_each_fib6_node_rt_rcu(fn) {
2915                         if (cfg->fc_flags & RTF_CACHE) {
2916                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
2917                                                               &cfg->fc_src);
2918                                 if (!rt_cache)
2919                                         continue;
2920                                 rt = rt_cache;
2921                         }
2922                         if (cfg->fc_ifindex &&
2923                             (!rt->dst.dev ||
2924                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2925                                 continue;
2926                         if (cfg->fc_flags & RTF_GATEWAY &&
2927                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2928                                 continue;
2929                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2930                                 continue;
2931                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2932                                 continue;
2933                         if (!dst_hold_safe(&rt->dst))
2934                                 break;
2935                         rcu_read_unlock();
2936
2937                         /* if gateway was specified only delete the one hop */
2938                         if (cfg->fc_flags & RTF_GATEWAY)
2939                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2940
2941                         return __ip6_del_rt_siblings(rt, cfg);
2942                 }
2943         }
2944         rcu_read_unlock();
2945
2946         return err;
2947 }
2948
/*
 * rt6_do_redirect - process a received ICMPv6 Redirect message
 * @dst: route the redirect applies to; a cached clone is created from it
 * @sk:  not used by this function
 * @skb: the Redirect packet
 *
 * Validates the message, updates the neighbour entry of the redirect
 * target, and inserts an RTF_CACHE exception route for the redirected
 * destination.  A NETEVENT_REDIRECT notification is sent once the
 * exception has been inserted.  Invalid messages are dropped silently
 * (with a rate-limited debug message in most cases).
 */
2949 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2950 {
2951         struct netevent_redirect netevent;
2952         struct rt6_info *rt, *nrt = NULL;
2953         struct ndisc_options ndopts;
2954         struct inet6_dev *in6_dev;
2955         struct neighbour *neigh;
2956         struct rd_msg *msg;
2957         int optlen, on_link;
2958         u8 *lladdr;
2959
             /* Whatever follows the fixed rd_msg header is treated as ND options. */
2960         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2961         optlen -= sizeof(*msg);
2962
2963         if (optlen < 0) {
2964                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2965                 return;
2966         }
2967
2968         msg = (struct rd_msg *)icmp6_hdr(skb);
2969
2970         if (ipv6_addr_is_multicast(&msg->dest)) {
2971                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2972                 return;
2973         }
2974
             /* target == dest marks the destination as on-link; any other
              * target has to be a link-local unicast address.
              */
2975         on_link = 0;
2976         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2977                 on_link = 1;
2978         } else if (ipv6_addr_type(&msg->target) !=
2979                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2980                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2981                 return;
2982         }
2983
             /* Ignore redirects on interfaces that forward or that have
              * accept_redirects turned off.
              */
2984         in6_dev = __in6_dev_get(skb->dev);
2985         if (!in6_dev)
2986                 return;
2987         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2988                 return;
2989
2990         /* RFC2461 8.1:
2991          *      The IP source address of the Redirect MUST be the same as the current
2992          *      first-hop router for the specified ICMP Destination Address.
2993          */
2994
2995         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2996                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2997                 return;
2998         }
2999
             /* The target link-layer address option is optional; lladdr stays
              * NULL when it is absent.
              */
3000         lladdr = NULL;
3001         if (ndopts.nd_opts_tgt_lladdr) {
3002                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3003                                              skb->dev);
3004                 if (!lladdr) {
3005                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3006                         return;
3007                 }
3008         }
3009
3010         rt = (struct rt6_info *) dst;
3011         if (rt->rt6i_flags & RTF_REJECT) {
3012                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3013                 return;
3014         }
3015
3016         /* Redirect received -> path was valid.
3017          * Look, redirects are sent only in response to data packets,
3018          * so that this nexthop apparently is reachable. --ANK
3019          */
3020         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3021
             /* NOTE(review): the final argument (1) presumably asks for the
              * entry to be created when missing - confirm against __neigh_lookup().
              */
3022         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3023         if (!neigh)
3024                 return;
3025
3026         /*
3027          *      We have finally decided to accept it.
3028          */
3029
             /* The router flags are only passed when the target is not on-link. */
3030         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3031                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3032                      NEIGH_UPDATE_F_OVERRIDE|
3033                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3034                                      NEIGH_UPDATE_F_ISROUTER)),
3035                      NDISC_REDIRECT, &ndopts);
3036
3037         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3038         if (!nrt)
3039                 goto out;
3040
3041         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3042         if (on_link)
3043                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3044
3045         nrt->rt6i_protocol = RTPROT_REDIRECT;
3046         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3047
3048         /* No need to remove rt from the exception table if rt is
3049          * a cached route because rt6_insert_exception() will
3050          * take care of it
3051          */
3052         if (rt6_insert_exception(nrt, rt)) {
3053                 dst_release_immediate(&nrt->dst);
3054                 goto out;
3055         }
3056
3057         netevent.old = &rt->dst;
3058         netevent.new = &nrt->dst;
3059         netevent.daddr = &msg->dest;
3060         netevent.neigh = neigh;
3061         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3062
3063 out:
3064         neigh_release(neigh);
3065 }
3066
3067 /*
3068  *      Misc support functions
3069  */
3070
3071 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3072 {
3073         BUG_ON(from->from);
3074
3075         rt->rt6i_flags &= ~RTF_EXPIRES;
3076         dst_hold(&from->dst);
3077         rt->from = from;
3078         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3079 }
3080
3081 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3082 {
3083         rt->dst.input = ort->dst.input;
3084         rt->dst.output = ort->dst.output;
3085         rt->rt6i_dst = ort->rt6i_dst;
3086         rt->dst.error = ort->dst.error;
3087         rt->rt6i_idev = ort->rt6i_idev;
3088         if (rt->rt6i_idev)
3089                 in6_dev_hold(rt->rt6i_idev);
3090         rt->dst.lastuse = jiffies;
3091         rt->rt6i_gateway = ort->rt6i_gateway;
3092         rt->rt6i_flags = ort->rt6i_flags;
3093         rt6_set_from(rt, ort);
3094         rt->rt6i_metric = ort->rt6i_metric;
3095 #ifdef CONFIG_IPV6_SUBTREES
3096         rt->rt6i_src = ort->rt6i_src;
3097 #endif
3098         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3099         rt->rt6i_table = ort->rt6i_table;
3100         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3101 }
3102
3103 #ifdef CONFIG_IPV6_ROUTE_INFO
3104 static struct rt6_info *rt6_get_route_info(struct net *net,
3105                                            const struct in6_addr *prefix, int prefixlen,
3106                                            const struct in6_addr *gwaddr,
3107                                            struct net_device *dev)
3108 {
3109         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3110         int ifindex = dev->ifindex;
3111         struct fib6_node *fn;
3112         struct rt6_info *rt = NULL;
3113         struct fib6_table *table;
3114
3115         table = fib6_get_table(net, tb_id);
3116         if (!table)
3117                 return NULL;
3118
3119         rcu_read_lock();
3120         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3121         if (!fn)
3122                 goto out;
3123
3124         for_each_fib6_node_rt_rcu(fn) {
3125                 if (rt->dst.dev->ifindex != ifindex)
3126                         continue;
3127                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3128                         continue;
3129                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3130                         continue;
3131                 ip6_hold_safe(NULL, &rt, false);
3132                 break;
3133         }
3134 out:
3135         rcu_read_unlock();
3136         return rt;
3137 }
3138
3139 static struct rt6_info *rt6_add_route_info(struct net *net,
3140                                            const struct in6_addr *prefix, int prefixlen,
3141                                            const struct in6_addr *gwaddr,
3142                                            struct net_device *dev,
3143                                            unsigned int pref)
3144 {
3145         struct fib6_config cfg = {
3146                 .fc_metric      = IP6_RT_PRIO_USER,
3147                 .fc_ifindex     = dev->ifindex,
3148                 .fc_dst_len     = prefixlen,
3149                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3150                                   RTF_UP | RTF_PREF(pref),
3151                 .fc_protocol = RTPROT_RA,
3152                 .fc_nlinfo.portid = 0,
3153                 .fc_nlinfo.nlh = NULL,
3154                 .fc_nlinfo.nl_net = net,
3155         };
3156
3157         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3158         cfg.fc_dst = *prefix;
3159         cfg.fc_gateway = *gwaddr;
3160
3161         /* We should treat it as a default route if prefix length is 0. */
3162         if (!prefixlen)
3163                 cfg.fc_flags |= RTF_DEFAULT;
3164
3165         ip6_route_add(&cfg, NULL);
3166
3167         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3168 }
3169 #endif
3170
3171 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3172 {
3173         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3174         struct rt6_info *rt;
3175         struct fib6_table *table;
3176
3177         table = fib6_get_table(dev_net(dev), tb_id);
3178         if (!table)
3179                 return NULL;
3180
3181         rcu_read_lock();
3182         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3183                 if (dev == rt->dst.dev &&
3184                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3185                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
3186                         break;
3187         }
3188         if (rt)
3189                 ip6_hold_safe(NULL, &rt, false);
3190         rcu_read_unlock();
3191         return rt;
3192 }
3193
3194 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3195                                      struct net_device *dev,
3196                                      unsigned int pref)
3197 {
3198         struct fib6_config cfg = {
3199                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3200                 .fc_metric      = IP6_RT_PRIO_USER,
3201                 .fc_ifindex     = dev->ifindex,
3202                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3203                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3204                 .fc_protocol = RTPROT_RA,
3205                 .fc_nlinfo.portid = 0,
3206                 .fc_nlinfo.nlh = NULL,
3207                 .fc_nlinfo.nl_net = dev_net(dev),
3208         };
3209
3210         cfg.fc_gateway = *gwaddr;
3211
3212         if (!ip6_route_add(&cfg, NULL)) {
3213                 struct fib6_table *table;
3214
3215                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3216                 if (table)
3217                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3218         }
3219
3220         return rt6_get_dflt_router(gwaddr, dev);
3221 }
3222
3223 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3224 {
3225         struct rt6_info *rt;
3226
3227 restart:
3228         rcu_read_lock();
3229         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3230                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3231                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3232                         if (dst_hold_safe(&rt->dst)) {
3233                                 rcu_read_unlock();
3234                                 ip6_del_rt(rt);
3235                         } else {
3236                                 rcu_read_unlock();
3237                         }
3238                         goto restart;
3239                 }
3240         }
3241         rcu_read_unlock();
3242
3243         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3244 }
3245
3246 void rt6_purge_dflt_routers(struct net *net)
3247 {
3248         struct fib6_table *table;
3249         struct hlist_head *head;
3250         unsigned int h;
3251
3252         rcu_read_lock();
3253
3254         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3255                 head = &net->ipv6.fib_table_hash[h];
3256                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3257                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3258                                 __rt6_purge_dflt_routers(table);
3259                 }
3260         }
3261
3262         rcu_read_unlock();
3263 }
3264
3265 static void rtmsg_to_fib6_config(struct net *net,
3266                                  struct in6_rtmsg *rtmsg,
3267                                  struct fib6_config *cfg)
3268 {
3269         memset(cfg, 0, sizeof(*cfg));
3270
3271         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3272                          : RT6_TABLE_MAIN;
3273         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3274         cfg->fc_metric = rtmsg->rtmsg_metric;
3275         cfg->fc_expires = rtmsg->rtmsg_info;
3276         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3277         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3278         cfg->fc_flags = rtmsg->rtmsg_flags;
3279
3280         cfg->fc_nlinfo.nl_net = net;
3281
3282         cfg->fc_dst = rtmsg->rtmsg_dst;
3283         cfg->fc_src = rtmsg->rtmsg_src;
3284         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3285 }
3286
3287 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3288 {
3289         struct fib6_config cfg;
3290         struct in6_rtmsg rtmsg;
3291         int err;
3292
3293         switch (cmd) {
3294         case SIOCADDRT:         /* Add a route */
3295         case SIOCDELRT:         /* Delete a route */
3296                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3297                         return -EPERM;
3298                 err = copy_from_user(&rtmsg, arg,
3299                                      sizeof(struct in6_rtmsg));
3300                 if (err)
3301                         return -EFAULT;
3302
3303                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3304
3305                 rtnl_lock();
3306                 switch (cmd) {
3307                 case SIOCADDRT:
3308                         err = ip6_route_add(&cfg, NULL);
3309                         break;
3310                 case SIOCDELRT:
3311                         err = ip6_route_del(&cfg, NULL);
3312                         break;
3313                 default:
3314                         err = -EINVAL;
3315                 }
3316                 rtnl_unlock();
3317
3318                 return err;
3319         }
3320
3321         return -EINVAL;
3322 }
3323
3324 /*
3325  *      Drop the packet on the floor
3326  */
3327
3328 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3329 {
3330         int type;
3331         struct dst_entry *dst = skb_dst(skb);
3332         switch (ipstats_mib_noroutes) {
3333         case IPSTATS_MIB_INNOROUTES:
3334                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3335                 if (type == IPV6_ADDR_ANY) {
3336                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3337                                       IPSTATS_MIB_INADDRERRORS);
3338                         break;
3339                 }
3340                 /* FALLTHROUGH */
3341         case IPSTATS_MIB_OUTNOROUTES:
3342                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3343                               ipstats_mib_noroutes);
3344                 break;
3345         }
3346         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3347         kfree_skb(skb);
3348         return 0;
3349 }
3350
3351 static int ip6_pkt_discard(struct sk_buff *skb)
3352 {
3353         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3354 }
3355
3356 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3357 {
3358         skb->dev = skb_dst(skb)->dev;
3359         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3360 }
3361
3362 static int ip6_pkt_prohibit(struct sk_buff *skb)
3363 {
3364         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3365 }
3366
3367 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3368 {
3369         skb->dev = skb_dst(skb)->dev;
3370         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3371 }
3372
3373 /*
3374  *      Allocate a dst for local (unicast / anycast) address.
3375  */
3376
3377 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3378                                     const struct in6_addr *addr,
3379                                     bool anycast)
3380 {
3381         u32 tb_id;
3382         struct net *net = dev_net(idev->dev);
3383         struct net_device *dev = idev->dev;
3384         struct rt6_info *rt;
3385
3386         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3387         if (!rt)
3388                 return ERR_PTR(-ENOMEM);
3389
3390         in6_dev_hold(idev);
3391
3392         rt->dst.flags |= DST_HOST;
3393         rt->dst.input = ip6_input;
3394         rt->dst.output = ip6_output;
3395         rt->rt6i_idev = idev;
3396
3397         rt->rt6i_protocol = RTPROT_KERNEL;
3398         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3399         if (anycast)
3400                 rt->rt6i_flags |= RTF_ANYCAST;
3401         else
3402                 rt->rt6i_flags |= RTF_LOCAL;
3403
3404         rt->rt6i_gateway  = *addr;
3405         rt->rt6i_dst.addr = *addr;
3406         rt->rt6i_dst.plen = 128;
3407         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3408         rt->rt6i_table = fib6_get_table(net, tb_id);
3409
3410         return rt;
3411 }
3412
/* Walker argument used to remove a deleted IP from prefsrc entries. */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device filter; NULL matches any device */
	struct net *net;		/* namespace whose null entry is skipped */
	struct in6_addr *addr;		/* address compared against rt6i_prefsrc */
};
3419
3420 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3421 {
3422         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3423         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3424         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3425
3426         if (((void *)rt->dst.dev == dev || !dev) &&
3427             rt != net->ipv6.ip6_null_entry &&
3428             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3429                 spin_lock_bh(&rt6_exception_lock);
3430                 /* remove prefsrc entry */
3431                 rt->rt6i_prefsrc.plen = 0;
3432                 /* need to update cache as well */
3433                 rt6_exceptions_remove_prefsrc(rt);
3434                 spin_unlock_bh(&rt6_exception_lock);
3435         }
3436         return 0;
3437 }
3438
3439 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3440 {
3441         struct net *net = dev_net(ifp->idev->dev);
3442         struct arg_dev_net_ip adni = {
3443                 .dev = ifp->idev->dev,
3444                 .net = net,
3445                 .addr = &ifp->addr,
3446         };
3447         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3448 }
3449
3450 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3451
3452 /* Remove routers and update dst entries when gateway turn into host. */
3453 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3454 {
3455         struct in6_addr *gateway = (struct in6_addr *)arg;
3456
3457         if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3458             ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3459                 return -1;
3460         }
3461
3462         /* Further clean up cached routes in exception table.
3463          * This is needed because cached route may have a different
3464          * gateway than its 'parent' in the case of an ip redirect.
3465          */
3466         rt6_exceptions_clean_tohost(rt, gateway);
3467
3468         return 0;
3469 }
3470
3471 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3472 {
3473         fib6_clean_all(net, fib6_clean_tohost, gateway);
3474 }
3475
/* Walker argument for the device up/down callbacks. The union member in
 * use depends on the callback: fib6_ifup() reads nh_flags, fib6_ifdown()
 * reads event.
 */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event refers to */
	union {
		unsigned int nh_flags;	/* nexthop flags to clear on "up" */
		unsigned long event;	/* NETDEV_* event on "down" */
	};
};
3483
3484 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3485 {
3486         struct rt6_info *iter;
3487         struct fib6_node *fn;
3488
3489         fn = rcu_dereference_protected(rt->rt6i_node,
3490                         lockdep_is_held(&rt->rt6i_table->tb6_lock));
3491         iter = rcu_dereference_protected(fn->leaf,
3492                         lockdep_is_held(&rt->rt6i_table->tb6_lock));
3493         while (iter) {
3494                 if (iter->rt6i_metric == rt->rt6i_metric &&
3495                     rt6_qualify_for_ecmp(iter))
3496                         return iter;
3497                 iter = rcu_dereference_protected(iter->rt6_next,
3498                                 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3499         }
3500
3501         return NULL;
3502 }
3503
3504 static bool rt6_is_dead(const struct rt6_info *rt)
3505 {
3506         if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3507             (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3508              rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3509                 return true;
3510
3511         return false;
3512 }
3513
3514 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3515 {
3516         struct rt6_info *iter;
3517         int total = 0;
3518
3519         if (!rt6_is_dead(rt))
3520                 total++;
3521
3522         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3523                 if (!rt6_is_dead(iter))
3524                         total++;
3525         }
3526
3527         return total;
3528 }
3529
3530 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3531 {
3532         int upper_bound = -1;
3533
3534         if (!rt6_is_dead(rt)) {
3535                 (*weight)++;
3536                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3537                                                     total) - 1;
3538         }
3539         atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3540 }
3541
3542 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3543 {
3544         struct rt6_info *iter;
3545         int weight = 0;
3546
3547         rt6_upper_bound_set(rt, &weight, total);
3548
3549         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3550                 rt6_upper_bound_set(iter, &weight, total);
3551 }
3552
3553 void rt6_multipath_rebalance(struct rt6_info *rt)
3554 {
3555         struct rt6_info *first;
3556         int total;
3557
3558         /* In case the entire multipath route was marked for flushing,
3559          * then there is no need to rebalance upon the removal of every
3560          * sibling route.
3561          */
3562         if (!rt->rt6i_nsiblings || rt->should_flush)
3563                 return;
3564
3565         /* During lookup routes are evaluated in order, so we need to
3566          * make sure upper bounds are assigned from the first sibling
3567          * onwards.
3568          */
3569         first = rt6_multipath_first_sibling(rt);
3570         if (WARN_ON_ONCE(!first))
3571                 return;
3572
3573         total = rt6_multipath_total_weight(first);
3574         rt6_multipath_upper_bound_set(first, total);
3575 }
3576
3577 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3578 {
3579         const struct arg_netdev_event *arg = p_arg;
3580         const struct net *net = dev_net(arg->dev);
3581
3582         if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3583                 rt->rt6i_nh_flags &= ~arg->nh_flags;
3584                 fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3585                 rt6_multipath_rebalance(rt);
3586         }
3587
3588         return 0;
3589 }
3590
3591 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3592 {
3593         struct arg_netdev_event arg = {
3594                 .dev = dev,
3595                 .nh_flags = nh_flags,
3596         };
3597
3598         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3599                 arg.nh_flags |= RTNH_F_LINKDOWN;
3600
3601         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3602 }
3603
3604 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3605                                    const struct net_device *dev)
3606 {
3607         struct rt6_info *iter;
3608
3609         if (rt->dst.dev == dev)
3610                 return true;
3611         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3612                 if (iter->dst.dev == dev)
3613                         return true;
3614
3615         return false;
3616 }
3617
3618 static void rt6_multipath_flush(struct rt6_info *rt)
3619 {
3620         struct rt6_info *iter;
3621
3622         rt->should_flush = 1;
3623         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3624                 iter->should_flush = 1;
3625 }
3626
3627 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3628                                              const struct net_device *down_dev)
3629 {
3630         struct rt6_info *iter;
3631         unsigned int dead = 0;
3632
3633         if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3634                 dead++;
3635         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3636                 if (iter->dst.dev == down_dev ||
3637                     iter->rt6i_nh_flags & RTNH_F_DEAD)
3638                         dead++;
3639
3640         return dead;
3641 }
3642
3643 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3644                                        const struct net_device *dev,
3645                                        unsigned int nh_flags)
3646 {
3647         struct rt6_info *iter;
3648
3649         if (rt->dst.dev == dev)
3650                 rt->rt6i_nh_flags |= nh_flags;
3651         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3652                 if (iter->dst.dev == dev)
3653                         iter->rt6i_nh_flags |= nh_flags;
3654 }
3655
3656 /* called with write lock held for table with rt */
3657 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3658 {
3659         const struct arg_netdev_event *arg = p_arg;
3660         const struct net_device *dev = arg->dev;
3661         const struct net *net = dev_net(dev);
3662
3663         if (rt == net->ipv6.ip6_null_entry)
3664                 return 0;
3665
3666         switch (arg->event) {
3667         case NETDEV_UNREGISTER:
3668                 return rt->dst.dev == dev ? -1 : 0;
3669         case NETDEV_DOWN:
3670                 if (rt->should_flush)
3671                         return -1;
3672                 if (!rt->rt6i_nsiblings)
3673                         return rt->dst.dev == dev ? -1 : 0;
3674                 if (rt6_multipath_uses_dev(rt, dev)) {
3675                         unsigned int count;
3676
3677                         count = rt6_multipath_dead_count(rt, dev);
3678                         if (rt->rt6i_nsiblings + 1 == count) {
3679                                 rt6_multipath_flush(rt);
3680                                 return -1;
3681                         }
3682                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3683                                                    RTNH_F_LINKDOWN);
3684                         fib6_update_sernum(rt);
3685                         rt6_multipath_rebalance(rt);
3686                 }
3687                 return -2;
3688         case NETDEV_CHANGE:
3689                 if (rt->dst.dev != dev ||
3690                     rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3691                         break;
3692                 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3693                 rt6_multipath_rebalance(rt);
3694                 break;
3695         }
3696
3697         return 0;
3698 }
3699
3700 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3701 {
3702         struct arg_netdev_event arg = {
3703                 .dev = dev,
3704                 .event = event,
3705         };
3706
3707         fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3708 }
3709
3710 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3711 {
3712         rt6_sync_down_dev(dev, event);
3713         rt6_uncached_list_flush_dev(dev_net(dev), dev);
3714         neigh_ifdown(&nd_tbl, dev);
3715 }
3716
/* Walker argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
3721
3722 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3723 {
3724         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3725         struct inet6_dev *idev;
3726
3727         /* In IPv6 pmtu discovery is not optional,
3728            so that RTAX_MTU lock cannot disable it.
3729            We still use this lock to block changes
3730            caused by addrconf/ndisc.
3731         */
3732
3733         idev = __in6_dev_get(arg->dev);
3734         if (!idev)
3735                 return 0;
3736
3737         /* For administrative MTU increase, there is no way to discover
3738            IPv6 PMTU increase, so PMTU increase should be updated here.
3739            Since RFC 1981 doesn't include administrative MTU increase
3740            update PMTU increase is a MUST. (i.e. jumbo frame)
3741          */
3742         /*
3743            If new MTU is less than route PMTU, this new MTU will be the
3744            lowest MTU in the path, update the route PMTU to reflect PMTU
3745            decreases; if new MTU is greater than route PMTU, and the
3746            old MTU is the lowest MTU in the path, update the route PMTU
3747            to reflect the increase. In this case if the other nodes' MTU
3748            also have the lowest MTU, TOO BIG MESSAGE will be lead to
3749            PMTU discovery.
3750          */
3751         if (rt->dst.dev == arg->dev &&
3752             dst_metric_raw(&rt->dst, RTAX_MTU) &&
3753             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3754                 spin_lock_bh(&rt6_exception_lock);
3755                 if (dst_mtu(&rt->dst) >= arg->mtu ||
3756                     (dst_mtu(&rt->dst) < arg->mtu &&
3757                      dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3758                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3759                 }
3760                 rt6_exceptions_update_pmtu(rt, arg->mtu);
3761                 spin_unlock_bh(&rt6_exception_lock);
3762         }
3763         return 0;
3764 }
3765
3766 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3767 {
3768         struct rt6_mtu_change_arg arg = {
3769                 .dev = dev,
3770                 .mtu = mtu,
3771         };
3772
3773         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3774 }
3775
3776 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3777         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3778         [RTA_OIF]               = { .type = NLA_U32 },
3779         [RTA_IIF]               = { .type = NLA_U32 },
3780         [RTA_PRIORITY]          = { .type = NLA_U32 },
3781         [RTA_METRICS]           = { .type = NLA_NESTED },
3782         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
3783         [RTA_PREF]              = { .type = NLA_U8 },
3784         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
3785         [RTA_ENCAP]             = { .type = NLA_NESTED },
3786         [RTA_EXPIRES]           = { .type = NLA_U32 },
3787         [RTA_UID]               = { .type = NLA_U32 },
3788         [RTA_MARK]              = { .type = NLA_U32 },
3789 };
3790
3791 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3792                               struct fib6_config *cfg,
3793                               struct netlink_ext_ack *extack)
3794 {
3795         struct rtmsg *rtm;
3796         struct nlattr *tb[RTA_MAX+1];
3797         unsigned int pref;
3798         int err;
3799
3800         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3801                           NULL);
3802         if (err < 0)
3803                 goto errout;
3804
3805         err = -EINVAL;
3806         rtm = nlmsg_data(nlh);
3807         memset(cfg, 0, sizeof(*cfg));
3808
3809         cfg->fc_table = rtm->rtm_table;
3810         cfg->fc_dst_len = rtm->rtm_dst_len;
3811         cfg->fc_src_len = rtm->rtm_src_len;
3812         cfg->fc_flags = RTF_UP;
3813         cfg->fc_protocol = rtm->rtm_protocol;
3814         cfg->fc_type = rtm->rtm_type;
3815
3816         if (rtm->rtm_type == RTN_UNREACHABLE ||
3817             rtm->rtm_type == RTN_BLACKHOLE ||
3818             rtm->rtm_type == RTN_PROHIBIT ||
3819             rtm->rtm_type == RTN_THROW)
3820                 cfg->fc_flags |= RTF_REJECT;
3821
3822         if (rtm->rtm_type == RTN_LOCAL)
3823                 cfg->fc_flags |= RTF_LOCAL;
3824
3825         if (rtm->rtm_flags & RTM_F_CLONED)
3826                 cfg->fc_flags |= RTF_CACHE;
3827
3828         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3829         cfg->fc_nlinfo.nlh = nlh;
3830         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3831
3832         if (tb[RTA_GATEWAY]) {
3833                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3834                 cfg->fc_flags |= RTF_GATEWAY;
3835         }
3836
3837         if (tb[RTA_DST]) {
3838                 int plen = (rtm->rtm_dst_len + 7) >> 3;
3839
3840                 if (nla_len(tb[RTA_DST]) < plen)
3841                         goto errout;
3842
3843                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3844         }
3845
3846         if (tb[RTA_SRC]) {
3847                 int plen = (rtm->rtm_src_len + 7) >> 3;
3848
3849                 if (nla_len(tb[RTA_SRC]) < plen)
3850                         goto errout;
3851
3852                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3853         }
3854
3855         if (tb[RTA_PREFSRC])
3856                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3857
3858         if (tb[RTA_OIF])
3859                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3860
3861         if (tb[RTA_PRIORITY])
3862                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3863
3864         if (tb[RTA_METRICS]) {
3865                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3866                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3867         }
3868
3869         if (tb[RTA_TABLE])
3870                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3871
3872         if (tb[RTA_MULTIPATH]) {
3873                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3874                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3875
3876                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3877                                                      cfg->fc_mp_len, extack);
3878                 if (err < 0)
3879                         goto errout;
3880         }
3881
3882         if (tb[RTA_PREF]) {
3883                 pref = nla_get_u8(tb[RTA_PREF]);
3884                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3885                     pref != ICMPV6_ROUTER_PREF_HIGH)
3886                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
3887                 cfg->fc_flags |= RTF_PREF(pref);
3888         }
3889
3890         if (tb[RTA_ENCAP])
3891                 cfg->fc_encap = tb[RTA_ENCAP];
3892
3893         if (tb[RTA_ENCAP_TYPE]) {
3894                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3895
3896                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3897                 if (err < 0)
3898                         goto errout;
3899         }
3900
3901         if (tb[RTA_EXPIRES]) {
3902                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3903
3904                 if (addrconf_finite_timeout(timeout)) {
3905                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3906                         cfg->fc_flags |= RTF_EXPIRES;
3907                 }
3908         }
3909
3910         err = 0;
3911 errout:
3912         return err;
3913 }
3914
3915 struct rt6_nh {
3916         struct rt6_info *rt6_info;
3917         struct fib6_config r_cfg;
3918         struct mx6_config mxc;
3919         struct list_head next;
3920 };
3921
3922 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3923 {
3924         struct rt6_nh *nh;
3925
3926         list_for_each_entry(nh, rt6_nh_list, next) {
3927                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3928                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3929                         nh->r_cfg.fc_ifindex);
3930         }
3931 }
3932
3933 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3934                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3935 {
3936         struct rt6_nh *nh;
3937         int err = -EEXIST;
3938
3939         list_for_each_entry(nh, rt6_nh_list, next) {
3940                 /* check if rt6_info already exists */
3941                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3942                         return err;
3943         }
3944
3945         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3946         if (!nh)
3947                 return -ENOMEM;
3948         nh->rt6_info = rt;
3949         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3950         if (err) {
3951                 kfree(nh);
3952                 return err;
3953         }
3954         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3955         list_add_tail(&nh->next, rt6_nh_list);
3956
3957         return 0;
3958 }
3959
3960 static void ip6_route_mpath_notify(struct rt6_info *rt,
3961                                    struct rt6_info *rt_last,
3962                                    struct nl_info *info,
3963                                    __u16 nlflags)
3964 {
3965         /* if this is an APPEND route, then rt points to the first route
3966          * inserted and rt_last points to last route inserted. Userspace
3967          * wants a consistent dump of the route which starts at the first
3968          * nexthop. Since sibling routes are always added at the end of
3969          * the list, find the first sibling of the last route appended
3970          */
3971         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3972                 rt = list_first_entry(&rt_last->rt6i_siblings,
3973                                       struct rt6_info,
3974                                       rt6i_siblings);
3975         }
3976
3977         if (rt)
3978                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3979 }
3980
3981 static int ip6_route_multipath_add(struct fib6_config *cfg,
3982                                    struct netlink_ext_ack *extack)
3983 {
3984         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3985         struct nl_info *info = &cfg->fc_nlinfo;
3986         struct fib6_config r_cfg;
3987         struct rtnexthop *rtnh;
3988         struct rt6_info *rt;
3989         struct rt6_nh *err_nh;
3990         struct rt6_nh *nh, *nh_safe;
3991         __u16 nlflags;
3992         int remaining;
3993         int attrlen;
3994         int err = 1;
3995         int nhn = 0;
3996         int replace = (cfg->fc_nlinfo.nlh &&
3997                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3998         LIST_HEAD(rt6_nh_list);
3999
4000         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4001         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4002                 nlflags |= NLM_F_APPEND;
4003
4004         remaining = cfg->fc_mp_len;
4005         rtnh = (struct rtnexthop *)cfg->fc_mp;
4006
4007         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4008          * rt6_info structs per nexthop
4009          */
4010         while (rtnh_ok(rtnh, remaining)) {
4011                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4012                 if (rtnh->rtnh_ifindex)
4013                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4014
4015                 attrlen = rtnh_attrlen(rtnh);
4016                 if (attrlen > 0) {
4017                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4018
4019                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4020                         if (nla) {
4021                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4022                                 r_cfg.fc_flags |= RTF_GATEWAY;
4023                         }
4024                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4025                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4026                         if (nla)
4027                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4028                 }
4029
4030                 rt = ip6_route_info_create(&r_cfg, extack);
4031                 if (IS_ERR(rt)) {
4032                         err = PTR_ERR(rt);
4033                         rt = NULL;
4034                         goto cleanup;
4035                 }
4036
4037                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
4038                 if (err) {
4039                         dst_release_immediate(&rt->dst);
4040                         goto cleanup;
4041                 }
4042
4043                 rtnh = rtnh_next(rtnh, &remaining);
4044         }
4045
4046         /* for add and replace send one notification with all nexthops.
4047          * Skip the notification in fib6_add_rt2node and send one with
4048          * the full route when done
4049          */
4050         info->skip_notify = 1;
4051
4052         err_nh = NULL;
4053         list_for_each_entry(nh, &rt6_nh_list, next) {
4054                 rt_last = nh->rt6_info;
4055                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
4056                 /* save reference to first route for notification */
4057                 if (!rt_notif && !err)
4058                         rt_notif = nh->rt6_info;
4059
4060                 /* nh->rt6_info is used or freed at this point, reset to NULL*/
4061                 nh->rt6_info = NULL;
4062                 if (err) {
4063                         if (replace && nhn)
4064                                 ip6_print_replace_route_err(&rt6_nh_list);
4065                         err_nh = nh;
4066                         goto add_errout;
4067                 }
4068
4069                 /* Because each route is added like a single route we remove
4070                  * these flags after the first nexthop: if there is a collision,
4071                  * we have already failed to add the first nexthop:
4072                  * fib6_add_rt2node() has rejected it; when replacing, old
4073                  * nexthops have been replaced by first new, the rest should
4074                  * be added to it.
4075                  */
4076                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4077                                                      NLM_F_REPLACE);
4078                 nhn++;
4079         }
4080
4081         /* success ... tell user about new route */
4082         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4083         goto cleanup;
4084
4085 add_errout:
4086         /* send notification for routes that were added so that
4087          * the delete notifications sent by ip6_route_del are
4088          * coherent
4089          */
4090         if (rt_notif)
4091                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4092
4093         /* Delete routes that were already added */
4094         list_for_each_entry(nh, &rt6_nh_list, next) {
4095                 if (err_nh == nh)
4096                         break;
4097                 ip6_route_del(&nh->r_cfg, extack);
4098         }
4099
4100 cleanup:
4101         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4102                 if (nh->rt6_info)
4103                         dst_release_immediate(&nh->rt6_info->dst);
4104                 kfree(nh->mxc.mx);
4105                 list_del(&nh->next);
4106                 kfree(nh);
4107         }
4108
4109         return err;
4110 }
4111
4112 static int ip6_route_multipath_del(struct fib6_config *cfg,
4113                                    struct netlink_ext_ack *extack)
4114 {
4115         struct fib6_config r_cfg;
4116         struct rtnexthop *rtnh;
4117         int remaining;
4118         int attrlen;
4119         int err = 1, last_err = 0;
4120
4121         remaining = cfg->fc_mp_len;
4122         rtnh = (struct rtnexthop *)cfg->fc_mp;
4123
4124         /* Parse a Multipath Entry */
4125         while (rtnh_ok(rtnh, remaining)) {
4126                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4127                 if (rtnh->rtnh_ifindex)
4128                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4129
4130                 attrlen = rtnh_attrlen(rtnh);
4131                 if (attrlen > 0) {
4132                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4133
4134                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4135                         if (nla) {
4136                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4137                                 r_cfg.fc_flags |= RTF_GATEWAY;
4138                         }
4139                 }
4140                 err = ip6_route_del(&r_cfg, extack);
4141                 if (err)
4142                         last_err = err;
4143
4144                 rtnh = rtnh_next(rtnh, &remaining);
4145         }
4146
4147         return last_err;
4148 }
4149
4150 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4151                               struct netlink_ext_ack *extack)
4152 {
4153         struct fib6_config cfg;
4154         int err;
4155
4156         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4157         if (err < 0)
4158                 return err;
4159
4160         if (cfg.fc_mp)
4161                 return ip6_route_multipath_del(&cfg, extack);
4162         else {
4163                 cfg.fc_delete_all_nh = 1;
4164                 return ip6_route_del(&cfg, extack);
4165         }
4166 }
4167
4168 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4169                               struct netlink_ext_ack *extack)
4170 {
4171         struct fib6_config cfg;
4172         int err;
4173
4174         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4175         if (err < 0)
4176                 return err;
4177
4178         if (cfg.fc_mp)
4179                 return ip6_route_multipath_add(&cfg, extack);
4180         else
4181                 return ip6_route_add(&cfg, extack);
4182 }
4183
4184 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4185 {
4186         int nexthop_len = 0;
4187
4188         if (rt->rt6i_nsiblings) {
4189                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4190                             + NLA_ALIGN(sizeof(struct rtnexthop))
4191                             + nla_total_size(16) /* RTA_GATEWAY */
4192                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
4193
4194                 nexthop_len *= rt->rt6i_nsiblings;
4195         }
4196
4197         return NLMSG_ALIGN(sizeof(struct rtmsg))
4198                + nla_total_size(16) /* RTA_SRC */
4199                + nla_total_size(16) /* RTA_DST */
4200                + nla_total_size(16) /* RTA_GATEWAY */
4201                + nla_total_size(16) /* RTA_PREFSRC */
4202                + nla_total_size(4) /* RTA_TABLE */
4203                + nla_total_size(4) /* RTA_IIF */
4204                + nla_total_size(4) /* RTA_OIF */
4205                + nla_total_size(4) /* RTA_PRIORITY */
4206                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4207                + nla_total_size(sizeof(struct rta_cacheinfo))
4208                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4209                + nla_total_size(1) /* RTA_PREF */
4210                + lwtunnel_get_encap_size(rt->dst.lwtstate)
4211                + nexthop_len;
4212 }
4213
4214 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4215                             unsigned int *flags, bool skip_oif)
4216 {
4217         if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4218                 *flags |= RTNH_F_DEAD;
4219
4220         if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4221                 *flags |= RTNH_F_LINKDOWN;
4222                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4223                         *flags |= RTNH_F_DEAD;
4224         }
4225
4226         if (rt->rt6i_flags & RTF_GATEWAY) {
4227                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4228                         goto nla_put_failure;
4229         }
4230
4231         if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4232                 *flags |= RTNH_F_OFFLOAD;
4233
4234         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4235         if (!skip_oif && rt->dst.dev &&
4236             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4237                 goto nla_put_failure;
4238
4239         if (rt->dst.lwtstate &&
4240             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4241                 goto nla_put_failure;
4242
4243         return 0;
4244
4245 nla_put_failure:
4246         return -EMSGSIZE;
4247 }
4248
4249 /* add multipath next hop */
4250 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4251 {
4252         struct rtnexthop *rtnh;
4253         unsigned int flags = 0;
4254
4255         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4256         if (!rtnh)
4257                 goto nla_put_failure;
4258
4259         rtnh->rtnh_hops = 0;
4260         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4261
4262         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4263                 goto nla_put_failure;
4264
4265         rtnh->rtnh_flags = flags;
4266
4267         /* length of rtnetlink header + attributes */
4268         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4269
4270         return 0;
4271
4272 nla_put_failure:
4273         return -EMSGSIZE;
4274 }
4275
4276 static int rt6_fill_node(struct net *net,
4277                          struct sk_buff *skb, struct rt6_info *rt,
4278                          struct in6_addr *dst, struct in6_addr *src,
4279                          int iif, int type, u32 portid, u32 seq,
4280                          unsigned int flags)
4281 {
4282         u32 metrics[RTAX_MAX];
4283         struct rtmsg *rtm;
4284         struct nlmsghdr *nlh;
4285         long expires;
4286         u32 table;
4287
4288         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4289         if (!nlh)
4290                 return -EMSGSIZE;
4291
4292         rtm = nlmsg_data(nlh);
4293         rtm->rtm_family = AF_INET6;
4294         rtm->rtm_dst_len = rt->rt6i_dst.plen;
4295         rtm->rtm_src_len = rt->rt6i_src.plen;
4296         rtm->rtm_tos = 0;
4297         if (rt->rt6i_table)
4298                 table = rt->rt6i_table->tb6_id;
4299         else
4300                 table = RT6_TABLE_UNSPEC;
4301         rtm->rtm_table = table;
4302         if (nla_put_u32(skb, RTA_TABLE, table))
4303                 goto nla_put_failure;
4304         if (rt->rt6i_flags & RTF_REJECT) {
4305                 switch (rt->dst.error) {
4306                 case -EINVAL:
4307                         rtm->rtm_type = RTN_BLACKHOLE;
4308                         break;
4309                 case -EACCES:
4310                         rtm->rtm_type = RTN_PROHIBIT;
4311                         break;
4312                 case -EAGAIN:
4313                         rtm->rtm_type = RTN_THROW;
4314                         break;
4315                 default:
4316                         rtm->rtm_type = RTN_UNREACHABLE;
4317                         break;
4318                 }
4319         }
4320         else if (rt->rt6i_flags & RTF_LOCAL)
4321                 rtm->rtm_type = RTN_LOCAL;
4322         else if (rt->rt6i_flags & RTF_ANYCAST)
4323                 rtm->rtm_type = RTN_ANYCAST;
4324         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4325                 rtm->rtm_type = RTN_LOCAL;
4326         else
4327                 rtm->rtm_type = RTN_UNICAST;
4328         rtm->rtm_flags = 0;
4329         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4330         rtm->rtm_protocol = rt->rt6i_protocol;
4331
4332         if (rt->rt6i_flags & RTF_CACHE)
4333                 rtm->rtm_flags |= RTM_F_CLONED;
4334
4335         if (dst) {
4336                 if (nla_put_in6_addr(skb, RTA_DST, dst))
4337                         goto nla_put_failure;
4338                 rtm->rtm_dst_len = 128;
4339         } else if (rtm->rtm_dst_len)
4340                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4341                         goto nla_put_failure;
4342 #ifdef CONFIG_IPV6_SUBTREES
4343         if (src) {
4344                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4345                         goto nla_put_failure;
4346                 rtm->rtm_src_len = 128;
4347         } else if (rtm->rtm_src_len &&
4348                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4349                 goto nla_put_failure;
4350 #endif
4351         if (iif) {
4352 #ifdef CONFIG_IPV6_MROUTE
4353                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4354                         int err = ip6mr_get_route(net, skb, rtm, portid);
4355
4356                         if (err == 0)
4357                                 return 0;
4358                         if (err < 0)
4359                                 goto nla_put_failure;
4360                 } else
4361 #endif
4362                         if (nla_put_u32(skb, RTA_IIF, iif))
4363                                 goto nla_put_failure;
4364         } else if (dst) {
4365                 struct in6_addr saddr_buf;
4366                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4367                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4368                         goto nla_put_failure;
4369         }
4370
4371         if (rt->rt6i_prefsrc.plen) {
4372                 struct in6_addr saddr_buf;
4373                 saddr_buf = rt->rt6i_prefsrc.addr;
4374                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4375                         goto nla_put_failure;
4376         }
4377
4378         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4379         if (rt->rt6i_pmtu)
4380                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4381         if (rtnetlink_put_metrics(skb, metrics) < 0)
4382                 goto nla_put_failure;
4383
4384         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4385                 goto nla_put_failure;
4386
4387         /* For multipath routes, walk the siblings list and add
4388          * each as a nexthop within RTA_MULTIPATH.
4389          */
4390         if (rt->rt6i_nsiblings) {
4391                 struct rt6_info *sibling, *next_sibling;
4392                 struct nlattr *mp;
4393
4394                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4395                 if (!mp)
4396                         goto nla_put_failure;
4397
4398                 if (rt6_add_nexthop(skb, rt) < 0)
4399                         goto nla_put_failure;
4400
4401                 list_for_each_entry_safe(sibling, next_sibling,
4402                                          &rt->rt6i_siblings, rt6i_siblings) {
4403                         if (rt6_add_nexthop(skb, sibling) < 0)
4404                                 goto nla_put_failure;
4405                 }
4406
4407                 nla_nest_end(skb, mp);
4408         } else {
4409                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4410                         goto nla_put_failure;
4411         }
4412
4413         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4414
4415         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4416                 goto nla_put_failure;
4417
4418         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4419                 goto nla_put_failure;
4420
4421
4422         nlmsg_end(skb, nlh);
4423         return 0;
4424
4425 nla_put_failure:
4426         nlmsg_cancel(skb, nlh);
4427         return -EMSGSIZE;
4428 }
4429
4430 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4431 {
4432         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4433         struct net *net = arg->net;
4434
4435         if (rt == net->ipv6.ip6_null_entry)
4436                 return 0;
4437
4438         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4439                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4440
4441                 /* user wants prefix routes only */
4442                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4443                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4444                         /* success since this is not a prefix route */
4445                         return 1;
4446                 }
4447         }
4448
4449         return rt6_fill_node(net,
4450                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4451                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4452                      NLM_F_MULTI);
4453 }
4454
4455 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4456                               struct netlink_ext_ack *extack)
4457 {
4458         struct net *net = sock_net(in_skb->sk);
4459         struct nlattr *tb[RTA_MAX+1];
4460         int err, iif = 0, oif = 0;
4461         struct dst_entry *dst;
4462         struct rt6_info *rt;
4463         struct sk_buff *skb;
4464         struct rtmsg *rtm;
4465         struct flowi6 fl6;
4466         bool fibmatch;
4467
4468         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4469                           extack);
4470         if (err < 0)
4471                 goto errout;
4472
4473         err = -EINVAL;
4474         memset(&fl6, 0, sizeof(fl6));
4475         rtm = nlmsg_data(nlh);
4476         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4477         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4478
4479         if (tb[RTA_SRC]) {
4480                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4481                         goto errout;
4482
4483                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4484         }
4485
4486         if (tb[RTA_DST]) {
4487                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4488                         goto errout;
4489
4490                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4491         }
4492
4493         if (tb[RTA_IIF])
4494                 iif = nla_get_u32(tb[RTA_IIF]);
4495
4496         if (tb[RTA_OIF])
4497                 oif = nla_get_u32(tb[RTA_OIF]);
4498
4499         if (tb[RTA_MARK])
4500                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4501
4502         if (tb[RTA_UID])
4503                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4504                                            nla_get_u32(tb[RTA_UID]));
4505         else
4506                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4507
4508         if (iif) {
4509                 struct net_device *dev;
4510                 int flags = 0;
4511
4512                 rcu_read_lock();
4513
4514                 dev = dev_get_by_index_rcu(net, iif);
4515                 if (!dev) {
4516                         rcu_read_unlock();
4517                         err = -ENODEV;
4518                         goto errout;
4519                 }
4520
4521                 fl6.flowi6_iif = iif;
4522
4523                 if (!ipv6_addr_any(&fl6.saddr))
4524                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4525
4526                 dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4527
4528                 rcu_read_unlock();
4529         } else {
4530                 fl6.flowi6_oif = oif;
4531
4532                 dst = ip6_route_output(net, NULL, &fl6);
4533         }
4534
4535
4536         rt = container_of(dst, struct rt6_info, dst);
4537         if (rt->dst.error) {
4538                 err = rt->dst.error;
4539                 ip6_rt_put(rt);
4540                 goto errout;
4541         }
4542
4543         if (rt == net->ipv6.ip6_null_entry) {
4544                 err = rt->dst.error;
4545                 ip6_rt_put(rt);
4546                 goto errout;
4547         }
4548
4549         if (fibmatch && rt->from) {
4550                 struct rt6_info *ort = rt->from;
4551
4552                 dst_hold(&ort->dst);
4553                 ip6_rt_put(rt);
4554                 rt = ort;
4555         }
4556
4557         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4558         if (!skb) {
4559                 ip6_rt_put(rt);
4560                 err = -ENOBUFS;
4561                 goto errout;
4562         }
4563
4564         skb_dst_set(skb, &rt->dst);
4565         if (fibmatch)
4566                 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4567                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4568                                     nlh->nlmsg_seq, 0);
4569         else
4570                 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4571                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4572                                     nlh->nlmsg_seq, 0);
4573         if (err < 0) {
4574                 kfree_skb(skb);
4575                 goto errout;
4576         }
4577
4578         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4579 errout:
4580         return err;
4581 }
4582
4583 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4584                      unsigned int nlm_flags)
4585 {
4586         struct sk_buff *skb;
4587         struct net *net = info->nl_net;
4588         u32 seq;
4589         int err;
4590
4591         err = -ENOBUFS;
4592         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4593
4594         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4595         if (!skb)
4596                 goto errout;
4597
4598         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4599                                 event, info->portid, seq, nlm_flags);
4600         if (err < 0) {
4601                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4602                 WARN_ON(err == -EMSGSIZE);
4603                 kfree_skb(skb);
4604                 goto errout;
4605         }
4606         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4607                     info->nlh, gfp_any());
4608         return;
4609 errout:
4610         if (err < 0)
4611                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4612 }
4613
4614 static int ip6_route_dev_notify(struct notifier_block *this,
4615                                 unsigned long event, void *ptr)
4616 {
4617         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4618         struct net *net = dev_net(dev);
4619
4620         if (!(dev->flags & IFF_LOOPBACK))
4621                 return NOTIFY_OK;
4622
4623         if (event == NETDEV_REGISTER) {
4624                 net->ipv6.ip6_null_entry->dst.dev = dev;
4625                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4626 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4627                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4628                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4629                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4630                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4631 #endif
4632          } else if (event == NETDEV_UNREGISTER &&
4633                     dev->reg_state != NETREG_UNREGISTERED) {
4634                 /* NETDEV_UNREGISTER could be fired for multiple times by
4635                  * netdev_wait_allrefs(). Make sure we only call this once.
4636                  */
4637                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4638 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4639                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4640                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4641 #endif
4642         }
4643
4644         return NOTIFY_OK;
4645 }
4646
4647 /*
4648  *      /proc
4649  */
4650
4651 #ifdef CONFIG_PROC_FS
4652
4653 static const struct file_operations ipv6_route_proc_fops = {
4654         .owner          = THIS_MODULE,
4655         .open           = ipv6_route_open,
4656         .read           = seq_read,
4657         .llseek         = seq_lseek,
4658         .release        = seq_release_net,
4659 };
4660
4661 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4662 {
4663         struct net *net = (struct net *)seq->private;
4664         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4665                    net->ipv6.rt6_stats->fib_nodes,
4666                    net->ipv6.rt6_stats->fib_route_nodes,
4667                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4668                    net->ipv6.rt6_stats->fib_rt_entries,
4669                    net->ipv6.rt6_stats->fib_rt_cache,
4670                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4671                    net->ipv6.rt6_stats->fib_discarded_routes);
4672
4673         return 0;
4674 }
4675
4676 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4677 {
4678         return single_open_net(inode, file, rt6_stats_seq_show);
4679 }
4680
4681 static const struct file_operations rt6_stats_seq_fops = {
4682         .owner   = THIS_MODULE,
4683         .open    = rt6_stats_seq_open,
4684         .read    = seq_read,
4685         .llseek  = seq_lseek,
4686         .release = single_release_net,
4687 };
4688 #endif  /* CONFIG_PROC_FS */
4689
4690 #ifdef CONFIG_SYSCTL
4691
4692 static
4693 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4694                               void __user *buffer, size_t *lenp, loff_t *ppos)
4695 {
4696         struct net *net;
4697         int delay;
4698         if (!write)
4699                 return -EINVAL;
4700
4701         net = (struct net *)ctl->extra1;
4702         delay = net->ipv6.sysctl.flush_delay;
4703         proc_dointvec(ctl, write, buffer, lenp, ppos);
4704         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4705         return 0;
4706 }
4707
4708 struct ctl_table ipv6_route_table_template[] = {
4709         {
4710                 .procname       =       "flush",
4711                 .data           =       &init_net.ipv6.sysctl.flush_delay,
4712                 .maxlen         =       sizeof(int),
4713                 .mode           =       0200,
4714                 .proc_handler   =       ipv6_sysctl_rtcache_flush
4715         },
4716         {
4717                 .procname       =       "gc_thresh",
4718                 .data           =       &ip6_dst_ops_template.gc_thresh,
4719                 .maxlen         =       sizeof(int),
4720                 .mode           =       0644,
4721                 .proc_handler   =       proc_dointvec,
4722         },
4723         {
4724                 .procname       =       "max_size",
4725                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
4726                 .maxlen         =       sizeof(int),
4727                 .mode           =       0644,
4728                 .proc_handler   =       proc_dointvec,
4729         },
4730         {
4731                 .procname       =       "gc_min_interval",
4732                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4733                 .maxlen         =       sizeof(int),
4734                 .mode           =       0644,
4735                 .proc_handler   =       proc_dointvec_jiffies,
4736         },
4737         {
4738                 .procname       =       "gc_timeout",
4739                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4740                 .maxlen         =       sizeof(int),
4741                 .mode           =       0644,
4742                 .proc_handler   =       proc_dointvec_jiffies,
4743         },
4744         {
4745                 .procname       =       "gc_interval",
4746                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4747                 .maxlen         =       sizeof(int),
4748                 .mode           =       0644,
4749                 .proc_handler   =       proc_dointvec_jiffies,
4750         },
4751         {
4752                 .procname       =       "gc_elasticity",
4753                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4754                 .maxlen         =       sizeof(int),
4755                 .mode           =       0644,
4756                 .proc_handler   =       proc_dointvec,
4757         },
4758         {
4759                 .procname       =       "mtu_expires",
4760                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4761                 .maxlen         =       sizeof(int),
4762                 .mode           =       0644,
4763                 .proc_handler   =       proc_dointvec_jiffies,
4764         },
4765         {
4766                 .procname       =       "min_adv_mss",
4767                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4768                 .maxlen         =       sizeof(int),
4769                 .mode           =       0644,
4770                 .proc_handler   =       proc_dointvec,
4771         },
4772         {
4773                 .procname       =       "gc_min_interval_ms",
4774                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4775                 .maxlen         =       sizeof(int),
4776                 .mode           =       0644,
4777                 .proc_handler   =       proc_dointvec_ms_jiffies,
4778         },
4779         { }
4780 };
4781
4782 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4783 {
4784         struct ctl_table *table;
4785
4786         table = kmemdup(ipv6_route_table_template,
4787                         sizeof(ipv6_route_table_template),
4788                         GFP_KERNEL);
4789
4790         if (table) {
4791                 table[0].data = &net->ipv6.sysctl.flush_delay;
4792                 table[0].extra1 = net;
4793                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4794                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4795                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4796                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4797                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4798                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4799                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4800                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4801                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4802
4803                 /* Don't export sysctls to unprivileged users */
4804                 if (net->user_ns != &init_user_ns)
4805                         table[0].procname = NULL;
4806         }
4807
4808         return table;
4809 }
4810 #endif
4811
4812 static int __net_init ip6_route_net_init(struct net *net)
4813 {
4814         int ret = -ENOMEM;
4815
4816         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4817                sizeof(net->ipv6.ip6_dst_ops));
4818
4819         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4820                 goto out_ip6_dst_ops;
4821
4822         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4823                                            sizeof(*net->ipv6.ip6_null_entry),
4824                                            GFP_KERNEL);
4825         if (!net->ipv6.ip6_null_entry)
4826                 goto out_ip6_dst_entries;
4827         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4828         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4829                          ip6_template_metrics, true);
4830
4831 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4832         net->ipv6.fib6_has_custom_rules = false;
4833         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4834                                                sizeof(*net->ipv6.ip6_prohibit_entry),
4835                                                GFP_KERNEL);
4836         if (!net->ipv6.ip6_prohibit_entry)
4837                 goto out_ip6_null_entry;
4838         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4839         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4840                          ip6_template_metrics, true);
4841
4842         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4843                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
4844                                                GFP_KERNEL);
4845         if (!net->ipv6.ip6_blk_hole_entry)
4846                 goto out_ip6_prohibit_entry;
4847         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4848         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4849                          ip6_template_metrics, true);
4850 #endif
4851
4852         net->ipv6.sysctl.flush_delay = 0;
4853         net->ipv6.sysctl.ip6_rt_max_size = 4096;
4854         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4855         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4856         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4857         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4858         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4859         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4860
4861         net->ipv6.ip6_rt_gc_expire = 30*HZ;
4862
4863         ret = 0;
4864 out:
4865         return ret;
4866
4867 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4868 out_ip6_prohibit_entry:
4869         kfree(net->ipv6.ip6_prohibit_entry);
4870 out_ip6_null_entry:
4871         kfree(net->ipv6.ip6_null_entry);
4872 #endif
4873 out_ip6_dst_entries:
4874         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4875 out_ip6_dst_ops:
4876         goto out;
4877 }
4878
4879 static void __net_exit ip6_route_net_exit(struct net *net)
4880 {
4881         kfree(net->ipv6.ip6_null_entry);
4882 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4883         kfree(net->ipv6.ip6_prohibit_entry);
4884         kfree(net->ipv6.ip6_blk_hole_entry);
4885 #endif
4886         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4887 }
4888
4889 static int __net_init ip6_route_net_init_late(struct net *net)
4890 {
4891 #ifdef CONFIG_PROC_FS
4892         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4893         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4894 #endif
4895         return 0;
4896 }
4897
4898 static void __net_exit ip6_route_net_exit_late(struct net *net)
4899 {
4900 #ifdef CONFIG_PROC_FS
4901         remove_proc_entry("ipv6_route", net->proc_net);
4902         remove_proc_entry("rt6_stats", net->proc_net);
4903 #endif
4904 }
4905
4906 static struct pernet_operations ip6_route_net_ops = {
4907         .init = ip6_route_net_init,
4908         .exit = ip6_route_net_exit,
4909 };
4910
4911 static int __net_init ipv6_inetpeer_init(struct net *net)
4912 {
4913         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4914
4915         if (!bp)
4916                 return -ENOMEM;
4917         inet_peer_base_init(bp);
4918         net->ipv6.peers = bp;
4919         return 0;
4920 }
4921
4922 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4923 {
4924         struct inet_peer_base *bp = net->ipv6.peers;
4925
4926         net->ipv6.peers = NULL;
4927         inetpeer_invalidate_tree(bp);
4928         kfree(bp);
4929 }
4930
4931 static struct pernet_operations ipv6_inetpeer_ops = {
4932         .init   =       ipv6_inetpeer_init,
4933         .exit   =       ipv6_inetpeer_exit,
4934 };
4935
4936 static struct pernet_operations ip6_route_net_late_ops = {
4937         .init = ip6_route_net_init_late,
4938         .exit = ip6_route_net_exit_late,
4939 };
4940
4941 static struct notifier_block ip6_route_dev_notifier = {
4942         .notifier_call = ip6_route_dev_notify,
4943         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4944 };
4945
4946 void __init ip6_route_init_special_entries(void)
4947 {
4948         /* Registering of the loopback is done before this portion of code,
4949          * the loopback reference in rt6_info will not be taken, do it
4950          * manually for init_net */
4951         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4952         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4953   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4954         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4955         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4956         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4957         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4958   #endif
4959 }
4960
4961 int __init ip6_route_init(void)
4962 {
4963         int ret;
4964         int cpu;
4965
4966         ret = -ENOMEM;
4967         ip6_dst_ops_template.kmem_cachep =
4968                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4969                                   SLAB_HWCACHE_ALIGN, NULL);
4970         if (!ip6_dst_ops_template.kmem_cachep)
4971                 goto out;
4972
4973         ret = dst_entries_init(&ip6_dst_blackhole_ops);
4974         if (ret)
4975                 goto out_kmem_cache;
4976
4977         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4978         if (ret)
4979                 goto out_dst_entries;
4980
4981         ret = register_pernet_subsys(&ip6_route_net_ops);
4982         if (ret)
4983                 goto out_register_inetpeer;
4984
4985         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4986
4987         ret = fib6_init();
4988         if (ret)
4989                 goto out_register_subsys;
4990
4991         ret = xfrm6_init();
4992         if (ret)
4993                 goto out_fib6_init;
4994
4995         ret = fib6_rules_init();
4996         if (ret)
4997                 goto xfrm6_init;
4998
4999         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5000         if (ret)
5001                 goto fib6_rules_init;
5002
5003         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5004                                    inet6_rtm_newroute, NULL, 0);
5005         if (ret < 0)
5006                 goto out_register_late_subsys;
5007
5008         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5009                                    inet6_rtm_delroute, NULL, 0);
5010         if (ret < 0)
5011                 goto out_register_late_subsys;
5012
5013         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5014                                    inet6_rtm_getroute, NULL,
5015                                    RTNL_FLAG_DOIT_UNLOCKED);
5016         if (ret < 0)
5017                 goto out_register_late_subsys;
5018
5019         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5020         if (ret)
5021                 goto out_register_late_subsys;
5022
5023         for_each_possible_cpu(cpu) {
5024                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5025
5026                 INIT_LIST_HEAD(&ul->head);
5027                 spin_lock_init(&ul->lock);
5028         }
5029
5030 out:
5031         return ret;
5032
5033 out_register_late_subsys:
5034         rtnl_unregister_all(PF_INET6);
5035         unregister_pernet_subsys(&ip6_route_net_late_ops);
5036 fib6_rules_init:
5037         fib6_rules_cleanup();
5038 xfrm6_init:
5039         xfrm6_fini();
5040 out_fib6_init:
5041         fib6_gc_cleanup();
5042 out_register_subsys:
5043         unregister_pernet_subsys(&ip6_route_net_ops);
5044 out_register_inetpeer:
5045         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5046 out_dst_entries:
5047         dst_entries_destroy(&ip6_dst_blackhole_ops);
5048 out_kmem_cache:
5049         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5050         goto out;
5051 }
5052
/*
 * ip6_route_cleanup - tear down what ip6_route_init() set up
 *
 * Unregisters the netdevice notifier and the three pernet subsystems,
 * calls fib6_rules_cleanup(), xfrm6_fini() and fib6_gc_cleanup(), then
 * calls dst_entries_destroy() on ip6_dst_blackhole_ops and destroys the
 * slab cache referenced by ip6_dst_ops_template.kmem_cachep.
 *
 * NOTE(review): the rtnetlink handlers registered by ip6_route_init()
 * are not unregistered here (no rtnl_unregister_all(PF_INET6) call);
 * presumably the caller takes care of that -- confirm.
 */
5053 void ip6_route_cleanup(void)
5054 {
5055         unregister_netdevice_notifier(&ip6_route_dev_notifier);
5056         unregister_pernet_subsys(&ip6_route_net_late_ops);
5057         fib6_rules_cleanup();
5058         xfrm6_fini();
5059         fib6_gc_cleanup();
        /*
         * NOTE(review): the error unwinding in ip6_route_init()
         * unregisters ip6_route_net_ops before ipv6_inetpeer_ops; the
         * order here is the opposite -- confirm this is intentional.
         */
5060         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5061         unregister_pernet_subsys(&ip6_route_net_ops);
5062         dst_entries_destroy(&ip6_dst_blackhole_ops);
        /* ip6_dst_blackhole_ops shares this cache, so destroy it last. */
5063         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5064 }