cf61dddfea09f5f19099437e0dbee4ad08a023af
[platform/kernel/linux-rpi.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/* Neighbour-reachability verdicts used when scoring candidate routes;
 * negative values mean the route is unusable (see rt6_score_route()).
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route unusable (e.g. device mismatch) */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour in NUD_FAILED state */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour entry; ask for round-robin */
	RT6_NUD_SUCCEED = 1		/* neighbour (probably) reachable */
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108                          struct fib6_info *rt, struct dst_entry *dst,
109                          struct in6_addr *dest, struct in6_addr *src,
110                          int iif, int type, u32 portid, u32 seq,
111                          unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113                                            const struct in6_addr *daddr,
114                                            const struct in6_addr *saddr);
115
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118                                            const struct in6_addr *prefix, int prefixlen,
119                                            const struct in6_addr *gwaddr,
120                                            struct net_device *dev,
121                                            unsigned int pref);
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123                                            const struct in6_addr *prefix, int prefixlen,
124                                            const struct in6_addr *gwaddr,
125                                            struct net_device *dev);
126 #endif
127
/* Per-CPU list of "uncached" rt6_info entries (dsts not linked into the
 * FIB tree), each protected by its own spinlock.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134
/* Link @rt onto this CPU's uncached list so it can be found later by
 * rt6_uncached_list_flush_dev() when its device disappears.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	/* remember which per-CPU list we joined so removal finds it */
	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
145
/* Unlink @rt from the per-CPU uncached list it was added to (no-op if it
 * was never linked) and decrement the per-netns uncached-route counter.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
158
/* @dev is going away: walk every CPU's uncached list and migrate routes
 * that reference @dev onto the netns loopback device, transferring both
 * the inet6_dev and the net_device references.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* nothing to migrate to if the vanishing device is loopback itself */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				/* take the new device ref before dropping the old */
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
190
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         if (!ipv6_addr_any(p))
196                 return (const void *) p;
197         else if (skb)
198                 return &ipv6_hdr(skb)->daddr;
199         return daddr;
200 }
201
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203                                    struct net_device *dev,
204                                    struct sk_buff *skb,
205                                    const void *daddr)
206 {
207         struct neighbour *n;
208
209         daddr = choose_neigh_daddr(gw, skb, daddr);
210         n = __ipv6_neigh_lookup(dev, daddr);
211         if (n)
212                 return n;
213
214         n = neigh_create(&nd_tbl, daddr, dev);
215         return IS_ERR(n) ? NULL : n;
216 }
217
/* dst_ops->neigh_lookup: resolve the neighbour via the route's gateway. */
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}
226
227 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
228 {
229         struct net_device *dev = dst->dev;
230         struct rt6_info *rt = (struct rt6_info *)dst;
231
232         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
233         if (!daddr)
234                 return;
235         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
236                 return;
237         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
238                 return;
239         __ipv6_confirm_neigh(dev, daddr);
240 }
241
/* dst_ops used for ordinary IPv6 routes; copied per-netns into
 * net->ipv6.ip6_dst_ops.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
260
261 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
262 {
263         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
264
265         return mtu ? : dst->dev->mtu;
266 }
267
/* Blackhole dsts deliberately ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
272
/* Blackhole dsts deliberately ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
277
/* dst_ops for "blackhole" clones of a dst: PMTU updates and redirects are
 * no-ops, and there is no gc or ifdown handling.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
289
/* Metrics used by the special route templates below; RTAX_HOPLIMIT is
 * explicitly left unset (0).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
293
/* FIB entry returned when a lookup matches nothing: a kernel-owned reject
 * route of type RTN_UNREACHABLE with the worst possible metric.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
302
/* dst counterpart of the null FIB entry: discards packets with
 * -ENETUNREACH on both input and output.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
314
315 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
316
/* "prohibit" route: drops packets with -EACCES (admin-prohibited). */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
328
/* "blackhole" route: silently discards packets (error -EINVAL, generic
 * dst_discard handlers, no ICMP generated).
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
340
341 #endif
342
/* Zero the rt6_info-specific tail of a freshly allocated entry.  The
 * embedded dst_entry at the front was already set up by dst_alloc(), so
 * only the bytes after it are cleared.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* dst + 1 == first byte past the dst_entry member */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
350
/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		/* account the allocation in the per-netns route stats */
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
366
/* dst_ops->destroy: final teardown of an rt6_info.  Releases the metrics
 * block, the uncached-list linkage, the inet6_dev reference and the
 * fib6_info this dst was created from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	/* metrics may be shared: free only on the last reference, and never
	 * free the global default block
	 */
	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
		kfree(p);

	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* atomically take ownership of rt->from so it is released once */
	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}
388
/* dst_ops->ifdown: @dev is going down; repoint the route's inet6_dev
 * reference at the loopback device of the same netns so the dst remains
 * usable until it is released.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
405
406 static bool __rt6_check_expired(const struct rt6_info *rt)
407 {
408         if (rt->rt6i_flags & RTF_EXPIRES)
409                 return time_after(jiffies, rt->dst.expires);
410         else
411                 return false;
412 }
413
/* Like __rt6_check_expired() but also consults the originating fib6_info:
 * a cached route without its own expiry is stale when the dst has been
 * obsoleted or its source FIB entry has expired.  Needs rcu_read_lock.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
429
/* Pick one route of an ECMP group by flow hash.  @match is the group's
 * first route; siblings are scanned for the first one whose hash upper
 * bound covers the flow hash.  Falls back to @match when its own bound
 * already covers the hash or no usable sibling is found.
 */
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* the in-range sibling ends the search either way; keep
		 * @match if that sibling scored as unusable
		 */
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
462
463 /*
464  *      Route lookup. rcu_read_lock() should be held.
465  */
466
/* Walk the fib6_info chain starting at @rt and return the first entry
 * usable for the requested @oif / @saddr.  Returns the netns null entry
 * when a strict interface match was requested but none found, or when the
 * fallback entry's nexthop is dead.  rcu_read_lock must be held.
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct fib6_info *sprt;

	/* no constraints at all: first entry wins if its nexthop is alive */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			/* no oif: match on ownership of the source address */
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
500
501 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-probe context: carries the NS target and the device (with a
 * held reference) from rt6_probe() to the workqueue handler.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
507
/* Workqueue handler for rt6_probe(): compute the target's solicited-node
 * multicast address, send a Neighbour Solicitation, then drop the device
 * reference taken by rt6_probe() and free the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
519
/* Kick off a Router Reachability Probe towards @rt's gateway if its
 * neighbour state is not known-valid and the per-device probe interval
 * has elapsed.  The NS transmission itself is deferred to a workqueue.
 */
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	/* NOTE(review): idev is dereferenced below with no NULL check;
	 * presumably gateway routes always sit on IPv6-enabled devices —
	 * confirm.
	 */
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		/* re-check under the lock, then rate-limit by neigh->updated */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		/* no neighbour entry: rate-limit via the route's own stamp */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);	/* released by rt6_probe_deferred() */
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
574 #else
/* Router probing is only built with CONFIG_IPV6_ROUTER_PREF; no-op here. */
static inline void rt6_probe(struct fib6_info *rt)
{
}
578 #endif
579
580 /*
581  * Default Router Selection (RFC 2461 6.3.6)
582  */
583 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
584 {
585         const struct net_device *dev = rt->fib6_nh.nh_dev;
586
587         if (!oif || dev->ifindex == oif)
588                 return 2;
589         return 0;
590 }
591
/* Classify the reachability of @rt's next hop for route scoring.  Routes
 * without a gateway (or flagged RTF_NONEXTHOP) trivially succeed.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		/* with router preferences, only a hard NUD_FAILED state
		 * demotes the route (to be probed)
		 */
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* no neighbour entry: optimistically accept when router
		 * preferences are enabled, otherwise request round-robin
		 */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
623
/* Score @rt for default-router selection (RFC 2461 6.3.6): bit 1 encodes
 * the interface match, bits 2+ the decoded router preference.  Returns a
 * negative rt6_nud_state value when the route is unusable.
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* preference occupies the bits above the device-match score */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
641
/* called with rcu_read_lock held */
/* Should routes over a link-down device be skipped?  Reflects the
 * device's ignore_routes_with_linkdown sysctl; false when the entry has
 * no device.
 */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		/* NOTE(review): idev is not NULL-checked here; presumably a
		 * FIB entry's device always carries inet6 data — confirm.
		 */
		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}
656
/* Score @rt and return whichever of @rt / @match scores higher so far
 * (best score tracked in *mpri).  Sets *do_rr when the current best wants
 * round-robin rotation.  Dead, link-down (unless ignored) and expired
 * routes are skipped outright.
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
695
/* Find the best route among leaf entries sharing @metric.  The scan
 * starts at @rr_head (the round-robin pointer), wraps back to @leaf, and
 * stops each pass at the first entry with a different metric.  If nothing
 * matched at @metric, the remaining entries are scanned as a fallback.
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;	/* first entry seen with a different metric, if any */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* wrap around: scan from the list head up to where we started */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* fallback: consider the entries beyond the primary metric */
	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
734
/* Select the route to use from fib6_node @fn, honouring the node's
 * round-robin pointer and rotating it when find_rr_leaf() requests it.
 * Returns the netns null entry when the node has no usable leaf.
 * Called with rcu_read_lock held.
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
784
785 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
786 {
787         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
788 }
789
790 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Handle a Route Information Option received in a Router Advertisement
 * from gateway @gwaddr: validate it, then add, refresh or withdraw the
 * corresponding RTF_ROUTEINFO route.  Returns 0 on success, -EINVAL for
 * a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	/* only the maximum-length option carries a full 128-bit prefix;
	 * otherwise copy out exactly prefix_len bits
	 */
	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* a zero-length prefix names the default route */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		/* an infinite lifetime clears any expiry on the route */
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
864 #endif
865
866 /*
867  *      Misc support functions
868  */
869
870 /* called with rcu_lock held */
871 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
872 {
873         struct net_device *dev = rt->fib6_nh.nh_dev;
874
875         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
876                 /* for copies of local routes, dst->dev needs to be the
877                  * device if it is a master device, the master device if
878                  * device is enslaved, and the loopback as the default
879                  */
880                 if (netif_is_l3_slave(dev) &&
881                     !rt6_need_strict(&rt->fib6_dst.addr))
882                         dev = l3mdev_master_dev_rcu(dev);
883                 else if (!netif_is_l3_master(dev))
884                         dev = dev_net(dev)->loopback_dev;
885                 /* last case is netif_is_l3_master(dev) is true in which
886                  * case we want dev returned to be dev
887                  */
888         }
889
890         return dev;
891 }
892
/* Map a fib6_type (RTN_*) to the dst.error value set on matching dsts.
 * 0 means the type delivers/forwards normally; negative errnos are used
 * for reject-style routes (see ip6_rt_init_dst_reject()).
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST] = 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
907
/* Return the dst error code associated with @fib6_type (see fib6_prop). */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
912
913 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
914 {
915         unsigned short flags = 0;
916
917         if (rt->dst_nocount)
918                 flags |= DST_NOCOUNT;
919         if (rt->dst_nopolicy)
920                 flags |= DST_NOPOLICY;
921         if (rt->dst_host)
922                 flags |= DST_HOST;
923
924         return flags;
925 }
926
927 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
928 {
929         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
930
931         switch (ort->fib6_type) {
932         case RTN_BLACKHOLE:
933                 rt->dst.output = dst_discard_out;
934                 rt->dst.input = dst_discard;
935                 break;
936         case RTN_PROHIBIT:
937                 rt->dst.output = ip6_pkt_prohibit_out;
938                 rt->dst.input = ip6_pkt_prohibit;
939                 break;
940         case RTN_THROW:
941         case RTN_UNREACHABLE:
942         default:
943                 rt->dst.output = ip6_pkt_discard_out;
944                 rt->dst.input = ip6_pkt_discard;
945                 break;
946         }
947 }
948
/* Initialize the dst hooks (error, input/output handlers, lwtunnel
 * state, lastuse) of @rt from the fib6 entry @ort it is copied from.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	/* Reject routes get error codes and discarding handlers. */
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	/* Input handler: local delivery for local/anycast destinations,
	 * multicast input for multicast destinations, forwarding otherwise.
	 */
	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	/* Share the lightweight-tunnel state, taking a reference on it. */
	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
974
/* Caller must already hold reference to @from; that reference is stored
 * in rt->from and released when @rt is destroyed.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	/* Share @from's metrics.  If they are not the shared defaults,
	 * mark them refcounted and take a reference so they stay valid
	 * for the lifetime of this dst.
	 */
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}
986
/* Copy routing information from fib6 entry @ort into the new dst @rt.
 * Caller must already hold reference to @ort; it is transferred to
 * rt->from via rt6_set_from().
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_copy_init_dst(rt, ort); /* NOTE(review): original call below */
	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	/* Take a reference on the nexthop device's inet6_dev (may be NULL). */
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
}
1004
/* Walk back up the fib trie from @fn looking for the next node carrying
 * route info (RTN_RTINFO).  When stepping to a parent that owns a
 * source-address subtree distinct from the current node, descend into
 * that subtree using @saddr instead.  Returns NULL once the tree root
 * is reached.  Caller must hold rcu_read_lock() (parent pointers are
 * rcu_dereference'd).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1022
1023 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1024                           bool null_fallback)
1025 {
1026         struct rt6_info *rt = *prt;
1027
1028         if (dst_hold_safe(&rt->dst))
1029                 return true;
1030         if (null_fallback) {
1031                 rt = net->ipv6.ip6_null_entry;
1032                 dst_hold(&rt->dst);
1033         } else {
1034                 rt = NULL;
1035         }
1036         *prt = rt;
1037         return false;
1038 }
1039
/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	/* If @rt is already being destroyed we cannot take a reference;
	 * fall back to the held null entry.
	 */
	if (!fib6_info_hold_safe(rt))
		goto fallback;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		/* Drop the reference taken above before falling back. */
		fib6_info_release(rt);
		goto fallback;
	}

	/* The fib6_info reference is transferred to nrt->from here. */
	ip6_rt_copy_init(nrt, rt);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}
1064
/* Look up a route for @fl6 in @table: walk the trie under RCU, match on
 * output device and multipath, backtracking toward the root while only
 * the null entry matches.  Prefer a cached exception route when one
 * exists.  Always returns a held dst (possibly the null entry), never
 * NULL.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		/* Only load-balance across siblings when no oif is forced. */
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	/* Nothing usable at this node: climb toward the root and retry. */
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
	}

	rcu_read_unlock();

	return rt;
}
1116
/* Policy-rule aware route lookup for @fl6; returns a held dst. */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1123
1124 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1125                             const struct in6_addr *saddr, int oif,
1126                             const struct sk_buff *skb, int strict)
1127 {
1128         struct flowi6 fl6 = {
1129                 .flowi6_oif = oif,
1130                 .daddr = *daddr,
1131         };
1132         struct dst_entry *dst;
1133         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1134
1135         if (saddr) {
1136                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1137                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1138         }
1139
1140         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1141         if (dst->error == 0)
1142                 return (struct rt6_info *) dst;
1143
1144         dst_release(dst);
1145
1146         return NULL;
1147 }
1148 EXPORT_SYMBOL(rt6_lookup);
1149
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes new route entry, the addition fails by any reason the
 * route is released.
 * Caller must hold dst before calling it.
 */

/* Insert @rt into its fib6 table under tb6_lock; returns fib6_add()'s
 * error code.
 */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}
1169
/* Insert @rt with default netlink info (no notifier portid/seq). */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
1176
/* Allocate a /128 RTF_CACHE clone of @ort for destination @daddr
 * (and @saddr under subtrees).  Returns NULL when @ort is going away
 * or allocation fails.  Caller context per ip6_rt_get_dev_rcu()
 * (rcu held).
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	/* Cannot clone an entry that is already being destroyed. */
	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		/* Drop the reference taken above. */
		fib6_info_release(ort);
		return NULL;
	}

	/* Reference on @ort is transferred to rt->from. */
	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	/* Narrow the clone to the exact destination. */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		/* Narrow the source prefix to the exact source address. */
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1218
/* Allocate a per-cpu RTF_PCPU copy of @rt.  Returns NULL when @rt is
 * going away or allocation fails.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	/* Cannot copy an entry that is already being destroyed. */
	if (!fib6_info_hold_safe(rt))
		return NULL;

	/* ip6_rt_get_dev_rcu() requires rcu protection. */
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		/* Drop the reference taken above. */
		fib6_info_release(rt);
		return NULL;
	}
	/* Reference on @rt is transferred to pcpu_rt->from. */
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1240
/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	/* Return the cached per-cpu dst only if we can still take a
	 * reference on it; on failure ip6_hold_safe() NULLs pcpu_rt
	 * (no null fallback here).
	 */
	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}
1254
/* Create and publish the per-cpu copy of @rt for this CPU.  Returns a
 * dst with two references: one kept in the per-cpu slot and one for the
 * caller.  Falls back to the held null entry on allocation failure.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	/* The slot must be empty: only the CPU owning it installs here. */
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
1273
/* exception hash table implementation
 *
 * rt6_exception_lock serializes all modifications (insert, remove,
 * flush) of the per-fib6_info exception buckets; readers use RCU.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1277
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	/* rcu readers may still traverse this entry; free after grace period */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1306
1307 /* Remove oldest rt6_ex in bucket and free the memory
1308  * Caller must hold rt6_exception_lock
1309  */
1310 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1311 {
1312         struct rt6_exception *rt6_ex, *oldest = NULL;
1313
1314         if (!bucket)
1315                 return;
1316
1317         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1318                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1319                         oldest = rt6_ex;
1320         }
1321         rt6_remove_exception(bucket, oldest);
1322 }
1323
/* Hash (dst[, src]) into an exception bucket index.  The seed is
 * randomized once at first use.
 *
 * NOTE(review): jhash with a random seed is not collision-resistant
 * against an attacker who can observe timing; upstream later moved
 * this to siphash — consider backporting that hardening.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1339
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 *
 * Returns the matching exception entry or NULL.  On return *bucket has
 * been advanced to the chain that (daddr, saddr) hashes to, even when
 * no entry matched.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1372
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 *
 * RCU-reader counterpart of __rt6_find_exception_spinlock(): same
 * lookup, but traverses the chain with hlist_for_each_entry_rcu().
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1407
/* Effective MTU of fib6 entry @rt: its stored PMTU if set, otherwise
 * the nexthop device's IPv6 MTU, capped at IP6_MAX_MTU and reduced by
 * any lightweight-tunnel encapsulation headroom.
 */
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		/* NOTE(review): __in6_dev_get() can return NULL; this
		 * deref assumes the route's device still has IPv6 state
		 * — confirm against callers/teardown paths.
		 */
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
1428
/* Insert cached route @nrt into the exception table of its parent fib6
 * entry @ort.  Allocates the bucket array on first use, replaces any
 * existing entry for the same (dst, src) key, and evicts the oldest
 * entry when a bucket grows past FIB6_MAX_DEPTH.  On success the fib
 * node's sernum is bumped so stale dsts are invalidated.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* rt6_flush_exceptions() set this flag; do not recreate buckets. */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	/* Replace any existing exception for the same key. */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* Keep the bucket bounded by evicting its oldest entry. */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1513
/* Remove every cached exception route attached to @rt and mark the
 * entry so no new exceptions can be inserted afterwards.
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		/* Every removal decrements depth; it must hit zero here. */
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1540
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 *
 * Returns the (unreferenced) cached route, or NULL when no unexpired
 * exception matches.
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = rcu_dereference(rt->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	/* Expired exceptions are skipped (they await garbage collection). */
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!res && src_key && src_key != &rt->fib6_src.addr) {
		src_key = &rt->fib6_src.addr;
		goto find_ex;
	}
#endif

	return res;
}
1585
/* Remove the passed in cached rt from the hash table that contains it.
 * Returns 0 on success, -EINVAL if @rt is not a cached route (or its
 * parent is gone), -ENOENT if no matching exception exists.
 * NOTE(review): rt->from is rcu_dereference'd here — callers are
 * expected to provide RCU protection; confirm at call sites.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1629
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 *
 * Refreshing the stamp keeps a recently used exception from being the
 * eviction victim in rt6_exception_remove_oldest().  Silently does
 * nothing when @rt is not a cached route or no entry is found.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
1666
1667 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1668 {
1669         struct rt6_exception_bucket *bucket;
1670         struct rt6_exception *rt6_ex;
1671         int i;
1672
1673         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1674                                         lockdep_is_held(&rt6_exception_lock));
1675
1676         if (bucket) {
1677                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1678                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1679                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1680                         }
1681                         bucket++;
1682                 }
1683         }
1684 }
1685
1686 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1687                                          struct rt6_info *rt, int mtu)
1688 {
1689         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1690          * lowest MTU in the path: always allow updating the route PMTU to
1691          * reflect PMTU decreases.
1692          *
1693          * If the new MTU is higher, and the route PMTU is equal to the local
1694          * MTU, this means the old MTU is the lowest in the path, so allow
1695          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1696          * handle this.
1697          */
1698
1699         if (dst_mtu(&rt->dst) >= mtu)
1700                 return true;
1701
1702         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1703                 return true;
1704
1705         return false;
1706 }
1707
/* Propagate a device MTU change to the cached exception routes of @rt,
 * subject to rt6_mtu_change_route_allowed().  Caller must hold
 * rt6_exception_lock.
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1736
/* Both the gateway and cache bits must be set for an exception to be a
 * candidate for rt6_exceptions_clean_tohost() removal.
 */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Drop every cached gateway exception of @rt whose gateway equals
 * @gateway (e.g. after the router became a host route target).
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1773
/* Decide the fate of one cached exception during garbage collection:
 * remove it when it has aged out, expired, or its gateway neighbour is
 * no longer flagged as a router; otherwise bump gc_args->more so the
 * GC keeps running.
 *
 * Called from rt6_age_exceptions() under rt6_exception_lock with
 * rcu_read_lock_bh held; __ipv6_neigh_lookup_noref() takes no neighbour
 * reference and relies on that RCU protection.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		/* Gateway exception whose neighbour lost (or never had)
		 * NTF_ROUTER: drop it.
		 */
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	/* Entry survived this pass; tell the GC there is still work. */
	gc_args->more++;
}
1817
/* Walk all exceptions cached under @rt and let
 * rt6_age_examine_exception() prune or keep each one.
 *
 * Takes rcu_read_lock_bh (for the lockless neighbour lookup inside the
 * examine helper) and rt6_exception_lock (to serialize removals).
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Cheap lockless check: nothing to do without a bucket table. */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1848
/* Look up the best fib6_info for fl6->{daddr,saddr} in @table.
 *
 * While only the null entry is found, backtrack towards less-specific
 * fib6 nodes; if backtracking is exhausted and RT6_LOOKUP_F_REACHABLE
 * was set, retry from the original node with that restriction dropped
 * so an unreachable route can still be returned.
 *
 * must be called with rcu lock held
 */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	/* Caller asked to skip nexthop device matching for this lookup. */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
1880
/* Core policy-routing resolver: map a flow to a dst (rt6_info).
 *
 * After the fib6 table lookup (and optional multipath selection) the
 * result is materialized in one of four ways:
 *   1. null entry                       -> held ip6_null_entry
 *   2. matching cached exception        -> that cached route
 *   3. FLOWI_FLAG_KNOWN_NH on a route without RTF_GATEWAY
 *                                       -> fresh uncached RTF_CACHE clone
 *   4. otherwise                        -> per-cpu copy of the route
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* Acting as a host (no global forwarding): prefer (probably)
	 * reachable routers — see the router-selection notes at the top
	 * of this file.
	 */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/*Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			/* Allocation failed: fall back to the null entry. */
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1958
/* fib6_rule_lookup() callback for the input path: resolve against the
 * flow's incoming interface (flowi6_iif).
 */
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}
1967
1968 struct dst_entry *ip6_route_input_lookup(struct net *net,
1969                                          struct net_device *dev,
1970                                          struct flowi6 *fl6,
1971                                          const struct sk_buff *skb,
1972                                          int flags)
1973 {
1974         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1975                 flags |= RT6_LOOKUP_F_IFACE;
1976
1977         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1978 }
1979 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1980
/* Extract the L3 fields used for multipath hashing of @skb into @keys.
 *
 * For ICMPv6 error messages (dest unreach, packet too big, time
 * exceeded, param problem) the fields are taken from the offending
 * packet embedded in the error, so the error hashes like the flow it
 * refers to; any pre-dissected @flkeys are discarded in that case since
 * they describe the outer header.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* Only ICMPv6 *error* types carry the offending packet. */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
2028
/* Compute the multipath hash for a flow.
 *
 * if skb is set it will be used and fl6 can be NULL; otherwise @fl6
 * must be valid.  Policy 0 hashes L3 fields only; policy 1 hashes the
 * L4 4-tuple plus protocol, reusing skb->hash when an L4 hash is
 * already present.
 *
 * NOTE(review): the switch has no default arm, so a policy value other
 * than 0/1 would leave hash_keys uninitialized — presumably the sysctl
 * bounds prevent that; confirm if new policies are added.
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			/* Dissect only when the caller did not already. */
			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	/* Result is halved (>> 1) on every return path — presumably to
	 * keep the value within 31 bits; TODO confirm against mp_hash
	 * consumers.
	 */
	return mhash >> 1;
}
2085
/* Route an incoming skb: build a flowi6 from its IPv6 header (plus
 * tunnel key and, for ICMPv6, a multipath hash) and attach the looked
 * up dst to the skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* Only RX tunnel metadata contributes to the route key. */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* ICMPv6 errors must hash like the flow they refer to — see
	 * ip6_multipath_l3_keys().
	 */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2115
/* fib6_rule_lookup() callback for the output path: resolve against the
 * flow's outgoing interface (flowi6_oif).
 */
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}
2124
/* Resolve an output route for @fl6 on behalf of @sk (may be NULL).
 *
 * Destinations that need strict scoping are first offered to the
 * l3mdev link-scope lookup.  Lookup flags are then derived from the
 * socket and flow: device binding or strict scope forces an interface
 * match; a non-any source address enables source matching; otherwise
 * the socket's source-address preferences are translated into flags.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	/* Output path: lookups originate from loopback. */
	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2153
/* Replace @dst_orig with a blackhole copy bound to the loopback device
 * that discards all input and output, while preserving the original's
 * metrics, gateway, keys and flags (minus RTF_PCPU).
 *
 * Consumes the caller's reference on @dst_orig; returns the new dst or
 * ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* Discard traffic in both directions. */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2186
2187 /*
2188  *      Destination cache support functions
2189  */
2190
2191 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2192 {
2193         u32 rt_cookie = 0;
2194
2195         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2196                 return false;
2197
2198         if (fib6_check_expired(f6i))
2199                 return false;
2200
2201         return true;
2202 }
2203
2204 static struct dst_entry *rt6_check(struct rt6_info *rt,
2205                                    struct fib6_info *from,
2206                                    u32 cookie)
2207 {
2208         u32 rt_cookie = 0;
2209
2210         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2211             rt_cookie != cookie)
2212                 return NULL;
2213
2214         if (rt6_check_expired(rt))
2215                 return NULL;
2216
2217         return &rt->dst;
2218 }
2219
2220 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2221                                             struct fib6_info *from,
2222                                             u32 cookie)
2223 {
2224         if (!__rt6_check_expired(rt) &&
2225             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2226             fib6_check(from, cookie))
2227                 return &rt->dst;
2228         else
2229                 return NULL;
2230 }
2231
/* dst_ops->check: validate a cached IPv6 dst against @cookie.
 * Returns the dst if still usable, NULL to force a route re-lookup.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	/* Per-cpu copies and uncached clones are validated against their
	 * originating fib6 entry; everything else via rt6_check().
	 */
	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2259
/* dst_ops->negative_advice: the caller reports trouble with this dst.
 * A cached exception is unlinked only when it has actually expired; any
 * non-cache dst is released unconditionally.  Returning NULL tells the
 * caller to drop the dst, otherwise it keeps using it.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2279
/* dst_ops->link_failure: output via this dst failed at the link layer.
 * Send an ICMPv6 address-unreachable error back, then either unlink the
 * cached exception or, for default routes, poison the fib6 node's
 * serial number so cached dsts fail their next cookie check and get
 * re-looked-up.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				/* -1 never matches a dst cookie. */
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2305
/* Apply @timeout to @rt0 via dst_set_expires() and mark it RTF_EXPIRES.
 * A route that was not yet expiring first inherits the current expiry
 * of its fib6 "from" entry.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2321
/* Stamp a new MTU metric on @rt, mark it RTF_MODIFIED and arm its
 * expiry with the ip6_rt_mtu_expires sysctl so the learned PMTU ages
 * out (RFC 8201).
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
2330
2331 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2332 {
2333         bool from_set;
2334
2335         rcu_read_lock();
2336         from_set = !!rcu_dereference(rt->from);
2337         rcu_read_unlock();
2338
2339         return !(rt->rt6i_flags & RTF_CACHE) &&
2340                 (rt->rt6i_flags & RTF_PCPU || from_set);
2341 }
2342
/* Core PMTU update for a dst, keyed by an IPv6 header or a socket.
 *
 * MTU increases are ignored (mtu >= dst_mtu) and the value is clamped
 * to IPV6_MIN_MTU.  Dsts that may be shared (per-cpu copies or
 * fib6-backed clones) are never modified in place: a private RTF_CACHE
 * clone is created, updated and inserted into the exception table
 * instead.  Locked-MTU dsts are left untouched.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	/* Addresses come from the packet header if available, else the
	 * socket; without either no clone can be keyed.
	 */
	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_info *from;
		struct rt6_info *nrt6;

		rcu_read_lock();
		from = rcu_dereference(rt6->from);
		if (!from) {
			rcu_read_unlock();
			return;
		}
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* Insertion failure means a duplicate exists or
			 * insertion was refused; drop the new clone.
			 */
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}
2391
/* dst_ops->update_pmtu: thin wrapper passing the skb's IPv6 header (if
 * any) to __ip6_rt_update_pmtu().
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
2397
/* Update PMTU for the flow described by the IPv6 header at skb->data.
 * Note: @mtu is network byte order (__be32) and converted with ntohl()
 * before use.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	/* Fall back to the namespace's reply mark when none was given. */
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2419
/* Update PMTU for the flow of @sk, then re-validate the socket's cached
 * dst and refresh it if the update invalidated it.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	/* Unbound socket: use the L3 master device of the receiving dev. */
	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	/* Nothing more to do if there is no cached dst or it is still
	 * valid per its own check routine.
	 */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2441
/* Store @dst on @sk via ip6_dst_store(), passing the socket's daddr
 * (and, under CONFIG_IPV6_SUBTREES, its saddr) only when the flow
 * actually used that address.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2458
/* Handle redirects: a flowi6 extended with the gateway address taken
 * from the redirect message.  fl6 must stay the first member so
 * __ip6_route_redirect() can cast the flowi6 pointer back to this
 * struct.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2464
2465 static struct rt6_info *__ip6_route_redirect(struct net *net,
2466                                              struct fib6_table *table,
2467                                              struct flowi6 *fl6,
2468                                              const struct sk_buff *skb,
2469                                              int flags)
2470 {
2471         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2472         struct rt6_info *ret = NULL, *rt_cache;
2473         struct fib6_info *rt;
2474         struct fib6_node *fn;
2475
2476         /* Get the "current" route for this destination and
2477          * check if the redirect has come from appropriate router.
2478          *
2479          * RFC 4861 specifies that redirects should only be
2480          * accepted if they come from the nexthop to the target.
2481          * Due to the way the routes are chosen, this notion
2482          * is a bit fuzzy and one might need to check all possible
2483          * routes.
2484          */
2485
2486         rcu_read_lock();
2487         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2488 restart:
2489         for_each_fib6_node_rt_rcu(fn) {
2490                 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2491                         continue;
2492                 if (fib6_check_expired(rt))
2493                         continue;
2494                 if (rt->fib6_flags & RTF_REJECT)
2495                         break;
2496                 if (!(rt->fib6_flags & RTF_GATEWAY))
2497                         continue;
2498                 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2499                         continue;
2500                 /* rt_cache's gateway might be different from its 'parent'
2501                  * in the case of an ip redirect.
2502                  * So we keep searching in the exception table if the gateway
2503                  * is different.
2504                  */
2505                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2506                         rt_cache = rt6_find_cached_rt(rt,
2507                                                       &fl6->daddr,
2508                                                       &fl6->saddr);
2509                         if (rt_cache &&
2510                             ipv6_addr_equal(&rdfl->gateway,
2511                                             &rt_cache->rt6i_gateway)) {
2512                                 ret = rt_cache;
2513                                 break;
2514                         }
2515                         continue;
2516                 }
2517                 break;
2518         }
2519
2520         if (!rt)
2521                 rt = net->ipv6.fib6_null_entry;
2522         else if (rt->fib6_flags & RTF_REJECT) {
2523                 ret = net->ipv6.ip6_null_entry;
2524                 goto out;
2525         }
2526
2527         if (rt == net->ipv6.fib6_null_entry) {
2528                 fn = fib6_backtrack(fn, &fl6->saddr);
2529                 if (fn)
2530                         goto restart;
2531         }
2532
2533 out:
2534         if (ret)
2535                 ip6_hold_safe(net, &ret, true);
2536         else
2537                 ret = ip6_create_rt_rcu(rt);
2538
2539         rcu_read_unlock();
2540
2541         trace_fib6_table_lookup(net, rt, table, fl6);
2542         return ret;
2543 };
2544
2545 static struct dst_entry *ip6_route_redirect(struct net *net,
2546                                             const struct flowi6 *fl6,
2547                                             const struct sk_buff *skb,
2548                                             const struct in6_addr *gateway)
2549 {
2550         int flags = RT6_LOOKUP_F_HAS_SADDR;
2551         struct ip6rd_flowi rdfl;
2552
2553         rdfl.fl6 = *fl6;
2554         rdfl.gateway = *gateway;
2555
2556         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2557                                 flags, __ip6_route_redirect);
2558 }
2559
2560 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2561                   kuid_t uid)
2562 {
2563         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2564         struct dst_entry *dst;
2565         struct flowi6 fl6;
2566
2567         memset(&fl6, 0, sizeof(fl6));
2568         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2569         fl6.flowi6_oif = oif;
2570         fl6.flowi6_mark = mark;
2571         fl6.daddr = iph->daddr;
2572         fl6.saddr = iph->saddr;
2573         fl6.flowlabel = ip6_flowinfo(iph);
2574         fl6.flowi6_uid = uid;
2575
2576         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2577         rt6_do_redirect(dst, NULL, skb);
2578         dst_release(dst);
2579 }
2580 EXPORT_SYMBOL_GPL(ip6_redirect);
2581
2582 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2583                             u32 mark)
2584 {
2585         const struct ipv6hdr *iph = ipv6_hdr(skb);
2586         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2587         struct dst_entry *dst;
2588         struct flowi6 fl6;
2589
2590         memset(&fl6, 0, sizeof(fl6));
2591         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2592         fl6.flowi6_oif = oif;
2593         fl6.flowi6_mark = mark;
2594         fl6.daddr = msg->dest;
2595         fl6.saddr = iph->daddr;
2596         fl6.flowi6_uid = sock_net_uid(net, NULL);
2597
2598         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2599         rt6_do_redirect(dst, NULL, skb);
2600         dst_release(dst);
2601 }
2602
2603 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2604 {
2605         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2606                      sk->sk_uid);
2607 }
2608 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2609
2610 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2611 {
2612         struct net_device *dev = dst->dev;
2613         unsigned int mtu = dst_mtu(dst);
2614         struct net *net = dev_net(dev);
2615
2616         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2617
2618         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2619                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2620
2621         /*
2622          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2623          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2624          * IPV6_MAXPLEN is also valid and means: "any MSS,
2625          * rely only on pmtu discovery"
2626          */
2627         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2628                 mtu = IPV6_MAXPLEN;
2629         return mtu;
2630 }
2631
2632 static unsigned int ip6_mtu(const struct dst_entry *dst)
2633 {
2634         struct inet6_dev *idev;
2635         unsigned int mtu;
2636
2637         mtu = dst_metric_raw(dst, RTAX_MTU);
2638         if (mtu)
2639                 goto out;
2640
2641         mtu = IPV6_MIN_MTU;
2642
2643         rcu_read_lock();
2644         idev = __in6_dev_get(dst->dev);
2645         if (idev)
2646                 mtu = idev->cnf.mtu6;
2647         rcu_read_unlock();
2648
2649 out:
2650         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2651
2652         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2653 }
2654
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct inet6_dev *idev;
	struct rt6_info *rt;
	u32 mtu = 0;

	/* 1. A locked MTU metric on the route wins outright (skips the
	 * IP6_MAX_MTU clamp below, but still pays lwtunnel headroom).
	 */
	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	/* 2. A cached exception route (e.g. from PMTU discovery) for this
	 * src/dst pair.
	 */
	rt = rt6_find_cached_rt(f6i, daddr, saddr);
	if (unlikely(rt)) {
		/* NOTE(review): if the exception has no RTAX_MTU metric this
		 * leaves mtu == 0 — confirm all cached exceptions carry one.
		 */
		mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
	} else {
		/* 3. Fall back to the egress device MTU, never below the
		 * IPv6 minimum.
		 */
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}
2692
/* Allocate an uncached host route toward fl6->daddr on @dev for sending
 * ICMPv6/NDISC packets.  The dst is put on the uncached list (so device
 * teardown can release it) and run through xfrm_lookup() before being
 * returned.  Returns an ERR_PTR on allocation failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);	/* takes an idev ref */
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);	/* drop the ref taken above */
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;	/* rt now owns the idev reference */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2731
/* dst_ops->gc hook: garbage-collect cached routes when the entry count
 * exceeds ip6_rt_max_size.  GC is rate limited by ip6_rt_gc_min_interval;
 * the ip6_rt_gc_expire "pressure" value grows on every forced run and
 * decays by 1/2^elasticity each call.  Returns non-zero when still over
 * the limit, which makes new dst allocation fail.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	/* Under the size limit and inside the minimum interval: do nothing. */
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	/* GC relieved the pressure: reset expire to half the gc timeout. */
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2756
2757 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2758                                struct fib6_config *cfg)
2759 {
2760         struct dst_metrics *p;
2761
2762         if (!cfg->fc_mx)
2763                 return 0;
2764
2765         p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2766         if (unlikely(!p))
2767                 return -ENOMEM;
2768
2769         refcount_set(&p->refcnt, 1);
2770         rt->fib6_metrics = p;
2771
2772         return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2773 }
2774
/* Resolve a nexthop gateway via a lookup confined to FIB table @tbid.
 * Returns a held rt6_info on a real match, or NULL when the table does
 * not exist or only the null entry matched (callers then fall back to
 * a full lookup).
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	/* Ignore link state so a gateway behind a currently-down link can
	 * still be validated.
	 */
	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}
2806
/* Validate an RTNH_F_ONLINK nexthop: reject the gateway if an existing
 * non-default route already resolves it as local/anycast/reject or via
 * a different device.  Returns 0 when acceptable, -EINVAL (with extack
 * set) otherwise.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct fib6_info *from;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		/* grt->from is RCU-protected; hold the read lock while
		 * inspecting it.
		 */
		rcu_read_lock();
		from = rcu_dereference(grt->from);
		if (!grt->dst.error &&
		    /* ignore match if it is the default route */
		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}
		rcu_read_unlock();

		ip6_rt_put(grt);
	}

	return err;
}
2839
/* Resolve the egress device/idev for the gateway of a regular
 * (non-onlink) route.  Tries the route's own table first, then a full
 * lookup.  When the caller supplied no device, *_dev and *idev are set
 * from the lookup result with references held (caller releases them).
 * Returns 0 only if the gateway is directly reachable (its route is not
 * itself via a gateway); -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* Discard a table-scoped match that is itself via a
			 * gateway or disagrees with the requested device;
			 * the full lookup below may still succeed.
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* Hand the resolved device/idev back to the caller with
		 * their own references.
		 */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2890
/* Validate cfg->fc_gateway for a route being added, resolving the
 * egress device/idev (via ip6_route_check_nh*) when the caller did not
 * specify one.  Rejects local-address gateways, non-link-local
 * addresses that are neither unicast nor IPv4-mapped, and loopback
 * egress devices.  Returns 0 on success, negative errno with extack set.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* link-local gateways are only "local" relative to the device */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2963
/* Allocate and initialize a fib6_info from a netlink route config.
 * Validates flags, prefix lengths, nexthop device and gateway, and the
 * preferred source address.  On success returns a new entry (not yet
 * inserted into any table) holding a reference on the nexthop device;
 * on failure returns ERR_PTR() with extack describing the problem.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	/* Metric 0 means "unspecified": use the default user priority. */
	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* Onlink routes require an explicit, up nexthop device. */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	/* Without NLM_F_CREATE the table should already exist; warn but
	 * create it anyway for backward compatibility.
	 */
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	/* Addrconf-generated routes are not counted against the dst limit. */
	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
		goto out;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* Build lightweight tunnel state if an encap was requested. */
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* May resolve and replace dev/idev with held references. */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->fib6_nh.nh_dev = dev;	/* rt keeps the dev reference */
	rt->fib6_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	/* Error path: drop everything acquired so far. */
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	fib6_info_release(rt);
	return ERR_PTR(err);
}
3186
3187 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3188                   struct netlink_ext_ack *extack)
3189 {
3190         struct fib6_info *rt;
3191         int err;
3192
3193         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3194         if (IS_ERR(rt))
3195                 return PTR_ERR(rt);
3196
3197         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3198         fib6_info_release(rt);
3199
3200         return err;
3201 }
3202
3203 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3204 {
3205         struct net *net = info->nl_net;
3206         struct fib6_table *table;
3207         int err;
3208
3209         if (rt == net->ipv6.fib6_null_entry) {
3210                 err = -ENOENT;
3211                 goto out;
3212         }
3213
3214         table = rt->fib6_table;
3215         spin_lock_bh(&table->tb6_lock);
3216         err = fib6_del(rt, info);
3217         spin_unlock_bh(&table->tb6_lock);
3218
3219 out:
3220         fib6_info_release(rt);
3221         return err;
3222 }
3223
3224 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3225 {
3226         struct nl_info info = { .nl_net = net };
3227
3228         return __ip6_del_rt(rt, &info);
3229 }
3230
/* Delete @rt and, when cfg->fc_delete_all_nh is set, all of its sibling
 * nexthops in one operation under the table lock.  When possible a
 * single RTM_DELROUTE notification covering every hop is sent (instead
 * of one per fib6_del).  Consumes the caller's reference on @rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				/* suppress per-hop notifications below */
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* Send the combined notification outside the table lock. */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3282
3283 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3284 {
3285         int rc = -ESRCH;
3286
3287         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3288                 goto out;
3289
3290         if (cfg->fc_flags & RTF_GATEWAY &&
3291             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3292                 goto out;
3293
3294         rc = rt6_remove_exception_rt(rt);
3295 out:
3296         return rc;
3297 }
3298
/* Handle route deletion (RTM_DELROUTE).  With RTF_CACHE only a matching
 * cached exception is removed; otherwise the first fib entry matching
 * the optional ifindex/gateway/metric/protocol filters is deleted —
 * including all sibling nexthops unless a specific gateway was given.
 * Returns 0 on success, -ESRCH when nothing matched.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* For RTF_CACHE an exact-match node is not required, so allow
	 * fib6_locate to return an intermediate node.
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* The entry could be freed once we drop the RCU lock,
			 * so take a reference; skip entries already on their
			 * way out.
			 */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3363
/* Process an inbound ICMPv6 Redirect (RFC 4861, section 8) for @dst.
 * After validating the message and its ND options, update the neighbour
 * cache entry for the new first hop and install a cached host route
 * (exception entry) that steers msg->dest via msg->target.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct fib6_info *from;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* Bytes of ND options following the fixed redirect header */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* dest == target means the destination itself is on-link;
	 * otherwise the new first hop must be a link-local unicast.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* Forwarding nodes and admin opt-out reject redirects */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from)
		goto out;

	/* Clone the origin route into a host-route cache entry for msg->dest */
	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* rt6_insert_exception() will take care of duplicated exceptions */
	if (rt6_insert_exception(nrt, from)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	/* Notify interested parties (e.g. route offload) of the change */
	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	rcu_read_unlock();
	neigh_release(neigh);
}
3484
3485 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up a route learned from an RA Route Information option:
 * prefix/prefixlen reached via gateway @gwaddr on @dev.  Returns the
 * entry with a reference held (fib6_info_hold_safe), or NULL.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;	/* iterator used by for_each_fib6_node_rt_rcu() */
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
		/* skip entries already being torn down */
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3521
3522 static struct fib6_info *rt6_add_route_info(struct net *net,
3523                                            const struct in6_addr *prefix, int prefixlen,
3524                                            const struct in6_addr *gwaddr,
3525                                            struct net_device *dev,
3526                                            unsigned int pref)
3527 {
3528         struct fib6_config cfg = {
3529                 .fc_metric      = IP6_RT_PRIO_USER,
3530                 .fc_ifindex     = dev->ifindex,
3531                 .fc_dst_len     = prefixlen,
3532                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3533                                   RTF_UP | RTF_PREF(pref),
3534                 .fc_protocol = RTPROT_RA,
3535                 .fc_type = RTN_UNICAST,
3536                 .fc_nlinfo.portid = 0,
3537                 .fc_nlinfo.nlh = NULL,
3538                 .fc_nlinfo.nl_net = net,
3539         };
3540
3541         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3542         cfg.fc_dst = *prefix;
3543         cfg.fc_gateway = *gwaddr;
3544
3545         /* We should treat it as a default route if prefix length is 0. */
3546         if (!prefixlen)
3547                 cfg.fc_flags |= RTF_DEFAULT;
3548
3549         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3550
3551         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3552 }
3553 #endif
3554
/* Find the RA-learned (RTF_ADDRCONF|RTF_DEFAULT) default route via
 * gateway @addr on @dev.  Returns the entry with a reference held, or
 * NULL if absent or already being freed.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;	/* iterator used by for_each_fib6_node_rt_rcu() */
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	/* rt is NULL here when the walk found no match */
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
3579
/* Install an RA-learned default route via gateway @gwaddr on @dev and
 * mark the owning table as holding a default router.  Returns the
 * entry (reference held) via rt6_get_dflt_router(), or NULL.
 */
struct fib6_info *rt6_add_dflt_router(struct net *net,
				     const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		/* NOTE(review): uses dev_net(dev) although @net is at
		 * hand - presumably identical; confirm and unify.
		 */
		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}
3610
/* Delete every RA-learned default/router route in @table, sparing
 * devices with accept_ra == 2 (which keep router routes even while
 * forwarding).  ip6_del_rt() cannot run under RCU, so for each victim
 * we take a reference, drop the read lock, delete, and restart the
 * walk since the tree may have changed meanwhile.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;	/* iterator used by for_each_fib6_node_rt_rcu() */

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3634
3635 void rt6_purge_dflt_routers(struct net *net)
3636 {
3637         struct fib6_table *table;
3638         struct hlist_head *head;
3639         unsigned int h;
3640
3641         rcu_read_lock();
3642
3643         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3644                 head = &net->ipv6.fib_table_hash[h];
3645                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3646                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3647                                 __rt6_purge_dflt_routers(net, table);
3648                 }
3649         }
3650
3651         rcu_read_unlock();
3652 }
3653
3654 static void rtmsg_to_fib6_config(struct net *net,
3655                                  struct in6_rtmsg *rtmsg,
3656                                  struct fib6_config *cfg)
3657 {
3658         memset(cfg, 0, sizeof(*cfg));
3659
3660         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3661                          : RT6_TABLE_MAIN;
3662         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3663         cfg->fc_metric = rtmsg->rtmsg_metric;
3664         cfg->fc_expires = rtmsg->rtmsg_info;
3665         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3666         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3667         cfg->fc_flags = rtmsg->rtmsg_flags;
3668         cfg->fc_type = rtmsg->rtmsg_type;
3669
3670         cfg->fc_nlinfo.nl_net = net;
3671
3672         cfg->fc_dst = rtmsg->rtmsg_dst;
3673         cfg->fc_src = rtmsg->rtmsg_src;
3674         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3675 }
3676
3677 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3678 {
3679         struct fib6_config cfg;
3680         struct in6_rtmsg rtmsg;
3681         int err;
3682
3683         switch (cmd) {
3684         case SIOCADDRT:         /* Add a route */
3685         case SIOCDELRT:         /* Delete a route */
3686                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3687                         return -EPERM;
3688                 err = copy_from_user(&rtmsg, arg,
3689                                      sizeof(struct in6_rtmsg));
3690                 if (err)
3691                         return -EFAULT;
3692
3693                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3694
3695                 rtnl_lock();
3696                 switch (cmd) {
3697                 case SIOCADDRT:
3698                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3699                         break;
3700                 case SIOCDELRT:
3701                         err = ip6_route_del(&cfg, NULL);
3702                         break;
3703                 default:
3704                         err = -EINVAL;
3705                 }
3706                 rtnl_unlock();
3707
3708                 return err;
3709         }
3710
3711         return -EINVAL;
3712 }
3713
3714 /*
3715  *      Drop the packet on the floor
3716  */
3717
/* Common drop path for "no route" dst handlers: bump the appropriate
 * SNMP counter, send an ICMPv6 destination-unreachable with @code,
 * and free the packet.  Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* unspecified destination counts as an address error */
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3741
/* Input handler for blackhole-style routes: drop with "no route" */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3746
/* Output handler for blackhole-style routes: drop with "no route" */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3752
/* Input handler for prohibit routes: drop with "administratively prohibited" */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3757
/* Output handler for prohibit routes: drop with "administratively prohibited" */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3763
3764 /*
3765  *      Allocate a dst for local (unicast / anycast) address.
3766  */
3767
3768 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3769                                      struct inet6_dev *idev,
3770                                      const struct in6_addr *addr,
3771                                      bool anycast, gfp_t gfp_flags)
3772 {
3773         u32 tb_id;
3774         struct net_device *dev = idev->dev;
3775         struct fib6_info *f6i;
3776
3777         f6i = fib6_info_alloc(gfp_flags);
3778         if (!f6i)
3779                 return ERR_PTR(-ENOMEM);
3780
3781         f6i->dst_nocount = true;
3782         f6i->dst_host = true;
3783         f6i->fib6_protocol = RTPROT_KERNEL;
3784         f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3785         if (anycast) {
3786                 f6i->fib6_type = RTN_ANYCAST;
3787                 f6i->fib6_flags |= RTF_ANYCAST;
3788         } else {
3789                 f6i->fib6_type = RTN_LOCAL;
3790                 f6i->fib6_flags |= RTF_LOCAL;
3791         }
3792
3793         f6i->fib6_nh.nh_gw = *addr;
3794         dev_hold(dev);
3795         f6i->fib6_nh.nh_dev = dev;
3796         f6i->fib6_dst.addr = *addr;
3797         f6i->fib6_dst.plen = 128;
3798         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3799         f6i->fib6_table = fib6_get_table(net, tb_id);
3800
3801         return f6i;
3802 }
3803
3804 /* remove deleted ip from prefsrc entries */
/* Argument block for fib6_remove_prefsrc() */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL matches any */
	struct net *net;
	struct in6_addr *addr;	/* preferred source address being removed */
};
3810
3811 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3812 {
3813         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3814         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3815         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3816
3817         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3818             rt != net->ipv6.fib6_null_entry &&
3819             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3820                 spin_lock_bh(&rt6_exception_lock);
3821                 /* remove prefsrc entry */
3822                 rt->fib6_prefsrc.plen = 0;
3823                 /* need to update cache as well */
3824                 rt6_exceptions_remove_prefsrc(rt);
3825                 spin_unlock_bh(&rt6_exception_lock);
3826         }
3827         return 0;
3828 }
3829
3830 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3831 {
3832         struct net *net = dev_net(ifp->idev->dev);
3833         struct arg_dev_net_ip adni = {
3834                 .dev = ifp->idev->dev,
3835                 .net = net,
3836                 .addr = &ifp->addr,
3837         };
3838         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3839 }
3840
3841 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3842
3843 /* Remove routers and update dst entries when gateway turn into host. */
3844 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3845 {
3846         struct in6_addr *gateway = (struct in6_addr *)arg;
3847
3848         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3849             ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3850                 return -1;
3851         }
3852
3853         /* Further clean up cached routes in exception table.
3854          * This is needed because cached route may have a different
3855          * gateway than its 'parent' in the case of an ip redirect.
3856          */
3857         rt6_exceptions_clean_tohost(rt, gateway);
3858
3859         return 0;
3860 }
3861
/* Remove RA router routes via @gateway (and clean cached routes) once
 * the gateway is known to be a host, not a router.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3866
/* Argument block shared by fib6_ifup() and fib6_ifdown() */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;	/* rt6_sync_up(): nexthop flags to clear */
		unsigned long event;	/* rt6_sync_down_dev(): netdev event code */
	};
};
3874
3875 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3876 {
3877         struct fib6_info *iter;
3878         struct fib6_node *fn;
3879
3880         fn = rcu_dereference_protected(rt->fib6_node,
3881                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3882         iter = rcu_dereference_protected(fn->leaf,
3883                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3884         while (iter) {
3885                 if (iter->fib6_metric == rt->fib6_metric &&
3886                     rt6_qualify_for_ecmp(iter))
3887                         return iter;
3888                 iter = rcu_dereference_protected(iter->fib6_next,
3889                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3890         }
3891
3892         return NULL;
3893 }
3894
3895 static bool rt6_is_dead(const struct fib6_info *rt)
3896 {
3897         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3898             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3899              fib6_ignore_linkdown(rt)))
3900                 return true;
3901
3902         return false;
3903 }
3904
3905 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3906 {
3907         struct fib6_info *iter;
3908         int total = 0;
3909
3910         if (!rt6_is_dead(rt))
3911                 total += rt->fib6_nh.nh_weight;
3912
3913         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3914                 if (!rt6_is_dead(iter))
3915                         total += iter->fib6_nh.nh_weight;
3916         }
3917
3918         return total;
3919 }
3920
3921 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3922 {
3923         int upper_bound = -1;
3924
3925         if (!rt6_is_dead(rt)) {
3926                 *weight += rt->fib6_nh.nh_weight;
3927                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3928                                                     total) - 1;
3929         }
3930         atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3931 }
3932
3933 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3934 {
3935         struct fib6_info *iter;
3936         int weight = 0;
3937
3938         rt6_upper_bound_set(rt, &weight, total);
3939
3940         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3941                 rt6_upper_bound_set(iter, &weight, total);
3942 }
3943
3944 void rt6_multipath_rebalance(struct fib6_info *rt)
3945 {
3946         struct fib6_info *first;
3947         int total;
3948
3949         /* In case the entire multipath route was marked for flushing,
3950          * then there is no need to rebalance upon the removal of every
3951          * sibling route.
3952          */
3953         if (!rt->fib6_nsiblings || rt->should_flush)
3954                 return;
3955
3956         /* During lookup routes are evaluated in order, so we need to
3957          * make sure upper bounds are assigned from the first sibling
3958          * onwards.
3959          */
3960         first = rt6_multipath_first_sibling(rt);
3961         if (WARN_ON_ONCE(!first))
3962                 return;
3963
3964         total = rt6_multipath_total_weight(first);
3965         rt6_multipath_upper_bound_set(first, total);
3966 }
3967
3968 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3969 {
3970         const struct arg_netdev_event *arg = p_arg;
3971         struct net *net = dev_net(arg->dev);
3972
3973         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3974                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3975                 fib6_update_sernum_upto_root(net, rt);
3976                 rt6_multipath_rebalance(rt);
3977         }
3978
3979         return 0;
3980 }
3981
3982 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3983 {
3984         struct arg_netdev_event arg = {
3985                 .dev = dev,
3986                 {
3987                         .nh_flags = nh_flags,
3988                 },
3989         };
3990
3991         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3992                 arg.nh_flags |= RTNH_F_LINKDOWN;
3993
3994         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3995 }
3996
3997 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3998                                    const struct net_device *dev)
3999 {
4000         struct fib6_info *iter;
4001
4002         if (rt->fib6_nh.nh_dev == dev)
4003                 return true;
4004         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4005                 if (iter->fib6_nh.nh_dev == dev)
4006                         return true;
4007
4008         return false;
4009 }
4010
4011 static void rt6_multipath_flush(struct fib6_info *rt)
4012 {
4013         struct fib6_info *iter;
4014
4015         rt->should_flush = 1;
4016         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4017                 iter->should_flush = 1;
4018 }
4019
4020 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4021                                              const struct net_device *down_dev)
4022 {
4023         struct fib6_info *iter;
4024         unsigned int dead = 0;
4025
4026         if (rt->fib6_nh.nh_dev == down_dev ||
4027             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4028                 dead++;
4029         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4030                 if (iter->fib6_nh.nh_dev == down_dev ||
4031                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
4032                         dead++;
4033
4034         return dead;
4035 }
4036
4037 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4038                                        const struct net_device *dev,
4039                                        unsigned int nh_flags)
4040 {
4041         struct fib6_info *iter;
4042
4043         if (rt->fib6_nh.nh_dev == dev)
4044                 rt->fib6_nh.nh_flags |= nh_flags;
4045         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4046                 if (iter->fib6_nh.nh_dev == dev)
4047                         iter->fib6_nh.nh_flags |= nh_flags;
4048 }
4049
/* called with write lock held for table with rt */
/* fib6_clean_all() callback for rt6_sync_down_dev().  The return
 * value drives fib6_clean_node(): 0 keeps the route, -1 asks the
 * walker to delete it.  NOTE(review): -2 appears to make the walker
 * treat the remaining siblings as already handled - confirm against
 * fib6_clean_node().
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* device is going away entirely: remove its routes */
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		/* non-multipath route: delete iff it uses this device */
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			/* every nexthop dead: flush the whole group */
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			/* otherwise just mark this device's nexthops dead */
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* carrier change: mark linkdown, keep the route */
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
4093
4094 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4095 {
4096         struct arg_netdev_event arg = {
4097                 .dev = dev,
4098                 {
4099                         .event = event,
4100                 },
4101         };
4102
4103         fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4104 }
4105
/* IPv6 is being disabled on @dev: sync the fib for the event, flush
 * its uncached dst entries and drop its neighbour cache entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4112
/* Argument block for rt6_mtu_change_route() */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
4117
/* fib6_clean_all() callback for rt6_mtu_change(): sync the RTAX_MTU
 * metric of routes (and their cached exceptions) through arg->dev
 * with the device's new MTU.  Always returns 0.
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* Update when the stored MTU would exceed the new device
		 * MTU, or when it was tracking the device MTU.
		 * NOTE(review): assumes idev->cnf.mtu6 still holds the
		 * previous value at this point - confirm ordering with
		 * the addrconf notifier.
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4152
4153 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4154 {
4155         struct rt6_mtu_change_arg arg = {
4156                 .dev = dev,
4157                 .mtu = mtu,
4158         };
4159
4160         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4161 }
4162
/* Netlink attribute policy used by rtm_to_fib6_config() when parsing
 * RTM_{NEW,DEL,GET}ROUTE requests; attributes not listed are rejected
 * or ignored by nlmsg_parse() according to policy defaults.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4182
/* rtm_to_fib6_config - translate an RTM_{NEW,DEL}ROUTE netlink request
 * into a fib6_config.
 *
 * @skb:    request skb; supplies the sender's portid and namespace
 * @cfg:    output, zeroed here and then filled from rtmsg + attributes
 * @extack: extended ack used for userspace-visible error messages
 *
 * Returns 0 on success or a negative errno.  Note that fc_mx and fc_mp
 * are pointers into the attribute payload of @nlh, not copies.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	/* default error for the attribute-length checks below */
	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* these route types never forward packets; mark them as rejects */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		/* only rtm_dst_len bits of prefix are significant; the
		 * attribute must carry at least that many bytes
		 */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	/* metrics and multipath nexthops stay as raw attribute payload;
	 * they are parsed later by the add/del paths
	 */
	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE overrides the (8-bit) rtm_table field */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* unknown preference values fall back to MEDIUM */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* infinite timeouts leave RTF_EXPIRES clear */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4312
/* Per-nexthop bookkeeping used while an RTA_MULTIPATH request is being
 * processed by ip6_route_multipath_add().
 */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route built for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop copy of the request */
	struct list_head next;		/* link on the local rt6_nh_list */
};
4318
4319 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4320 {
4321         struct rt6_nh *nh;
4322
4323         list_for_each_entry(nh, rt6_nh_list, next) {
4324                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4325                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4326                         nh->r_cfg.fc_ifindex);
4327         }
4328 }
4329
4330 static int ip6_route_info_append(struct net *net,
4331                                  struct list_head *rt6_nh_list,
4332                                  struct fib6_info *rt,
4333                                  struct fib6_config *r_cfg)
4334 {
4335         struct rt6_nh *nh;
4336         int err = -EEXIST;
4337
4338         list_for_each_entry(nh, rt6_nh_list, next) {
4339                 /* check if fib6_info already exists */
4340                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4341                         return err;
4342         }
4343
4344         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4345         if (!nh)
4346                 return -ENOMEM;
4347         nh->fib6_info = rt;
4348         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4349         list_add_tail(&nh->next, rt6_nh_list);
4350
4351         return 0;
4352 }
4353
4354 static void ip6_route_mpath_notify(struct fib6_info *rt,
4355                                    struct fib6_info *rt_last,
4356                                    struct nl_info *info,
4357                                    __u16 nlflags)
4358 {
4359         /* if this is an APPEND route, then rt points to the first route
4360          * inserted and rt_last points to last route inserted. Userspace
4361          * wants a consistent dump of the route which starts at the first
4362          * nexthop. Since sibling routes are always added at the end of
4363          * the list, find the first sibling of the last route appended
4364          */
4365         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4366                 rt = list_first_entry(&rt_last->fib6_siblings,
4367                                       struct fib6_info,
4368                                       fib6_siblings);
4369         }
4370
4371         if (rt)
4372                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4373 }
4374
/* ip6_route_multipath_add - install one fib6_info per nexthop of an
 * RTA_MULTIPATH request, then send a single notification covering the
 * whole route.
 *
 * Two-phase: first every rtnexthop is parsed and a fib6_info is created
 * and queued on rt6_nh_list; only then are the entries inserted into
 * the FIB.  On a mid-insertion failure, already-inserted routes are
 * deleted again so the FIB is left consistent.
 *
 * Returns 0 on success or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* start from the shared request config, then override the
		 * per-nexthop fields (ifindex, gateway, encap)
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		/* rtnh_hops carries weight - 1 on the wire */
		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* drop any fib6_info that was never handed to the FIB, and free
	 * the bookkeeping entries themselves
	 */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4521
4522 static int ip6_route_multipath_del(struct fib6_config *cfg,
4523                                    struct netlink_ext_ack *extack)
4524 {
4525         struct fib6_config r_cfg;
4526         struct rtnexthop *rtnh;
4527         int remaining;
4528         int attrlen;
4529         int err = 1, last_err = 0;
4530
4531         remaining = cfg->fc_mp_len;
4532         rtnh = (struct rtnexthop *)cfg->fc_mp;
4533
4534         /* Parse a Multipath Entry */
4535         while (rtnh_ok(rtnh, remaining)) {
4536                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4537                 if (rtnh->rtnh_ifindex)
4538                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4539
4540                 attrlen = rtnh_attrlen(rtnh);
4541                 if (attrlen > 0) {
4542                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4543
4544                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4545                         if (nla) {
4546                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4547                                 r_cfg.fc_flags |= RTF_GATEWAY;
4548                         }
4549                 }
4550                 err = ip6_route_del(&r_cfg, extack);
4551                 if (err)
4552                         last_err = err;
4553
4554                 rtnh = rtnh_next(rtnh, &remaining);
4555         }
4556
4557         return last_err;
4558 }
4559
4560 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4561                               struct netlink_ext_ack *extack)
4562 {
4563         struct fib6_config cfg;
4564         int err;
4565
4566         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4567         if (err < 0)
4568                 return err;
4569
4570         if (cfg.fc_mp)
4571                 return ip6_route_multipath_del(&cfg, extack);
4572         else {
4573                 cfg.fc_delete_all_nh = 1;
4574                 return ip6_route_del(&cfg, extack);
4575         }
4576 }
4577
4578 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4579                               struct netlink_ext_ack *extack)
4580 {
4581         struct fib6_config cfg;
4582         int err;
4583
4584         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4585         if (err < 0)
4586                 return err;
4587
4588         if (cfg.fc_mp)
4589                 return ip6_route_multipath_add(&cfg, extack);
4590         else
4591                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4592 }
4593
/* Upper bound, in bytes, on the netlink message needed by
 * rt6_fill_node() to dump @rt, including the RTA_MULTIPATH payload for
 * every sibling of a multipath route.  Must stay in sync with what
 * rt6_fill_node() actually emits (-EMSGSIZE there implies a bug here).
 */
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		/* per-sibling cost; the lead route's own gateway and
		 * encap are counted in the base size below
		 */
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}
4623
/* Emit the nexthop attributes of @rt into @skb and accumulate the
 * corresponding RTNH_F_* bits into *@flags.
 *
 * @skip_oif: true when encoding inside RTA_MULTIPATH, where the
 *	      rtnexthop struct already carries the ifindex.
 *
 * Returns 0 on success or -EMSGSIZE if the skb has no room left.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		/* a linkdown nexthop is reported dead as well when the
		 * table ignores linkdown nexthops
		 */
		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4662
/* Add one rtnexthop entry for @rt inside an already-open RTA_MULTIPATH
 * nest.  Returns 0 on success or -EMSGSIZE if the skb is full.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	/* wire format carries weight - 1 */
	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	/* skip_oif: the rtnexthop struct above already has the ifindex */
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4690
/* rt6_fill_node - encode one route into an rtnetlink message.
 *
 * @rt:	    FIB entry being dumped
 * @dst:    optional dst_entry (non-NULL for route-get on a cached/clone
 *	    rt6_info); when set, address/flag fields are taken from it
 * @dest:   optional exact destination (route-get); forces plen 128
 * @src:    optional exact source (route-get, subtrees); forces plen 128
 * @iif:    input interface for route-get replies, 0 otherwise
 *
 * Returns 0 on success or -EMSGSIZE; on failure the partially-built
 * message is cancelled.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* prefer the cached rt6_info's keys/flags when one was passed */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* rtm_table is 8 bits; larger ids go in RTA_TABLE only */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast route-get is answered by the mroute code;
		 * err == 0 means it already completed the message
		 */
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	/* expires is reported relative to now */
	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4841
4842 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4843 {
4844         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4845         struct net *net = arg->net;
4846
4847         if (rt == net->ipv6.fib6_null_entry)
4848                 return 0;
4849
4850         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4851                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4852
4853                 /* user wants prefix routes only */
4854                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4855                     !(rt->fib6_flags & RTF_PREFIX_RT)) {
4856                         /* success since this is not a prefix route */
4857                         return 1;
4858                 }
4859         }
4860
4861         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4862                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4863                              arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4864 }
4865
/* RTM_GETROUTE handler: resolve the flow described by the request and
 * reply with the matching route.
 *
 * With RTA_IIF set, an input-path lookup is done as if the packet
 * arrived on that device; otherwise an output lookup is done.  With
 * RTM_F_FIB_MATCH the FIB entry itself is reported instead of the
 * resolved dst.  Returns 0 on success or a negative errno.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	/* default error for the attribute-length checks below */
	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		/* input-path lookup: resolve as if the packet had been
		 * received on device @iif
		 */
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* the skb now owns the dst reference */
	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (from) {
		if (fibmatch)
			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
					    iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
		else
			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
					    &fl6.saddr, iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
	} else {
		err = -ENETUNREACH;
	}
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
5011
5012 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5013                      unsigned int nlm_flags)
5014 {
5015         struct sk_buff *skb;
5016         struct net *net = info->nl_net;
5017         u32 seq;
5018         int err;
5019
5020         err = -ENOBUFS;
5021         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5022
5023         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5024         if (!skb)
5025                 goto errout;
5026
5027         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5028                             event, info->portid, seq, nlm_flags);
5029         if (err < 0) {
5030                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5031                 WARN_ON(err == -EMSGSIZE);
5032                 kfree_skb(skb);
5033                 goto errout;
5034         }
5035         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5036                     info->nlh, gfp_any());
5037         return;
5038 errout:
5039         if (err < 0)
5040                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5041 }
5042
/* Netdevice notifier: keep the per-netns special routes (null entry
 * and, with multiple tables, prohibit/blackhole entries) bound to the
 * namespace's loopback device across its register/unregister.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	/* only the loopback device anchors the special routes */
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
5076
5077 /*
5078  *      /proc
5079  */
5080
5081 #ifdef CONFIG_PROC_FS
5082 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5083 {
5084         struct net *net = (struct net *)seq->private;
5085         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5086                    net->ipv6.rt6_stats->fib_nodes,
5087                    net->ipv6.rt6_stats->fib_route_nodes,
5088                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5089                    net->ipv6.rt6_stats->fib_rt_entries,
5090                    net->ipv6.rt6_stats->fib_rt_cache,
5091                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5092                    net->ipv6.rt6_stats->fib_discarded_routes);
5093
5094         return 0;
5095 }
5096 #endif  /* CONFIG_PROC_FS */
5097
5098 #ifdef CONFIG_SYSCTL
5099
5100 static
5101 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5102                               void __user *buffer, size_t *lenp, loff_t *ppos)
5103 {
5104         struct net *net;
5105         int delay;
5106         if (!write)
5107                 return -EINVAL;
5108
5109         net = (struct net *)ctl->extra1;
5110         delay = net->ipv6.sysctl.flush_delay;
5111         proc_dointvec(ctl, write, buffer, lenp, ppos);
5112         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5113         return 0;
5114 }
5115
/* Template for the per-netns net.ipv6.route.* sysctl table.  The .data
 * pointers below reference init_net; ipv6_route_sysctl_init() kmemdup()s
 * this array and repoints each entry at the caller's struct net, so the
 * ENTRY ORDER here must stay in sync with the table[N].data assignments
 * in that function.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		/* Write-only trigger: flushes the route cache (see
		 * ipv6_sysctl_rtcache_flush).
		 */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* Same backing variable as gc_min_interval, exposed in
		 * milliseconds instead of jiffies.
		 */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
5189
5190 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5191 {
5192         struct ctl_table *table;
5193
5194         table = kmemdup(ipv6_route_table_template,
5195                         sizeof(ipv6_route_table_template),
5196                         GFP_KERNEL);
5197
5198         if (table) {
5199                 table[0].data = &net->ipv6.sysctl.flush_delay;
5200                 table[0].extra1 = net;
5201                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5202                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5203                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5204                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5205                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5206                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5207                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5208                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5209                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5210
5211                 /* Don't export sysctls to unprivileged users */
5212                 if (net->user_ns != &init_user_ns)
5213                         table[0].procname = NULL;
5214         }
5215
5216         return table;
5217 }
5218 #endif
5219
/* Per-netns routing setup: clone the dst_ops template, then allocate the
 * special route entries (null, and with multiple tables also prohibit and
 * blackhole) and seed the sysctl defaults.  On failure, unwinds in exact
 * reverse order of allocation via the goto chain at the bottom.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* sysctl defaults; jiffies-valued knobs are exposed via
	 * proc_dointvec_jiffies in ipv6_route_table_template[].
	 */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwind: reverse order of the allocations above. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5294
/* Per-netns teardown: release the special route entries allocated by
 * ip6_route_net_init() and the dst entry counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5305
5306 static int __net_init ip6_route_net_init_late(struct net *net)
5307 {
5308 #ifdef CONFIG_PROC_FS
5309         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5310                         sizeof(struct ipv6_route_iter));
5311         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5312                         rt6_stats_seq_show, NULL);
5313 #endif
5314         return 0;
5315 }
5316
/* Late per-netns teardown: remove the proc entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5324
/* Core per-netns routing state (special entries, dst ops, sysctls). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5329
5330 static int __net_init ipv6_inetpeer_init(struct net *net)
5331 {
5332         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5333
5334         if (!bp)
5335                 return -ENOMEM;
5336         inet_peer_base_init(bp);
5337         net->ipv6.peers = bp;
5338         return 0;
5339 }
5340
/* Tear down the namespace's inetpeer base: clear the pointer first,
 * then invalidate the tree before freeing the base itself.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
5349
/* Per-netns inetpeer storage used by IPv6 routing. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5354
/* Registered last so proc entries appear only once the namespace's
 * routing state is fully set up.
 */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5359
/* Runs after addrconf's notifier (lower priority) so idev state exists
 * by the time ip6_route_dev_notify() grabs references.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5364
5365 void __init ip6_route_init_special_entries(void)
5366 {
5367         /* Registering of the loopback is done before this portion of code,
5368          * the loopback reference in rt6_info will not be taken, do it
5369          * manually for init_net */
5370         init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5371         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5372         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5373   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5374         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5375         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5376         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5377         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5378   #endif
5379 }
5380
/* Module init for IPv6 routing: slab cache, per-netns subsystems, fib6,
 * xfrm6, policy rules, rtnetlink handlers, netdev notifier, and the
 * per-cpu uncached-route lists.  On any failure, unwinds everything
 * registered so far via the goto chain at the bottom (reverse order).
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts share the same slab as regular rt6_info. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* Any rtnl_register_module() failure unregisters ALL PF_INET6
	 * handlers (rtnl_unregister_all below), so the three calls share
	 * one error label.
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Per-cpu lists of uncached (DST_NOCACHE) routes. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind: reverse order of the registrations above. */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5472
/* Module teardown: mirror of ip6_route_init(), releasing everything in
 * the reverse order it was registered.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}