serial: imx: restore handshaking irq for imx1
[platform/kernel/linux-rpi.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
/* Forward declaration; the definition appears further down in this file. */
static int ip6_rt_type_to_error(u8 fib6_type);

/*
 * CREATE_TRACE_POINTS is defined only around the include of the fib6
 * trace header and undefined again right after it. The fib6_table_lookup
 * tracepoint symbol is exported with the GPL-only export macro.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
79
/*
 * Result codes used while scoring a route. Negative values are failures
 * and are returned as-is by rt6_score_route(); find_match() interprets
 * them.
 */
enum rt6_nud_state {
	/* route must be skipped: find_match() bails out on this value */
	RT6_NUD_FAIL_HARD = -3,
	/* neighbour is in NUD_FAILED (CONFIG_IPV6_ROUTER_PREF builds only) */
	RT6_NUD_FAIL_PROBE = -2,
	/* find_match() maps this to score 0 and requests round-robin */
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108                          struct fib6_info *rt, struct dst_entry *dst,
109                          struct in6_addr *dest, struct in6_addr *src,
110                          int iif, int type, u32 portid, u32 seq,
111                          unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113                                            struct in6_addr *daddr,
114                                            struct in6_addr *saddr);
115
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118                                            const struct in6_addr *prefix, int prefixlen,
119                                            const struct in6_addr *gwaddr,
120                                            struct net_device *dev,
121                                            unsigned int pref);
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123                                            const struct in6_addr *prefix, int prefixlen,
124                                            const struct in6_addr *gwaddr,
125                                            struct net_device *dev);
126 #endif
127
/*
 * List of rt6_info entries linked through their rt6i_uncached member.
 * One instance exists per CPU (rt6_uncached_list below).
 */
struct uncached_list {
	spinlock_t		lock;	/* taken with _bh around every list change */
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138
139         rt->rt6i_uncached_list = ul;
140
141         spin_lock_bh(&ul->lock);
142         list_add_tail(&rt->rt6i_uncached, &ul->head);
143         spin_unlock_bh(&ul->lock);
144 }
145
146 void rt6_uncached_list_del(struct rt6_info *rt)
147 {
148         if (!list_empty(&rt->rt6i_uncached)) {
149                 struct uncached_list *ul = rt->rt6i_uncached_list;
150                 struct net *net = dev_net(rt->dst.dev);
151
152                 spin_lock_bh(&ul->lock);
153                 list_del(&rt->rt6i_uncached);
154                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155                 spin_unlock_bh(&ul->lock);
156         }
157 }
158
159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
160 {
161         struct net_device *loopback_dev = net->loopback_dev;
162         int cpu;
163
164         if (dev == loopback_dev)
165                 return;
166
167         for_each_possible_cpu(cpu) {
168                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
169                 struct rt6_info *rt;
170
171                 spin_lock_bh(&ul->lock);
172                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
173                         struct inet6_dev *rt_idev = rt->rt6i_idev;
174                         struct net_device *rt_dev = rt->dst.dev;
175
176                         if (rt_idev->dev == dev) {
177                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
178                                 in6_dev_put(rt_idev);
179                         }
180
181                         if (rt_dev == dev) {
182                                 rt->dst.dev = loopback_dev;
183                                 dev_hold(rt->dst.dev);
184                                 dev_put(rt_dev);
185                         }
186                 }
187                 spin_unlock_bh(&ul->lock);
188         }
189 }
190
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         if (!ipv6_addr_any(p))
196                 return (const void *) p;
197         else if (skb)
198                 return &ipv6_hdr(skb)->daddr;
199         return daddr;
200 }
201
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203                                    struct net_device *dev,
204                                    struct sk_buff *skb,
205                                    const void *daddr)
206 {
207         struct neighbour *n;
208
209         daddr = choose_neigh_daddr(gw, skb, daddr);
210         n = __ipv6_neigh_lookup(dev, daddr);
211         if (n)
212                 return n;
213         return neigh_create(&nd_tbl, daddr, dev);
214 }
215
216 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
217                                               struct sk_buff *skb,
218                                               const void *daddr)
219 {
220         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
221
222         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
223 }
224
225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
226 {
227         struct net_device *dev = dst->dev;
228         struct rt6_info *rt = (struct rt6_info *)dst;
229
230         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
231         if (!daddr)
232                 return;
233         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
234                 return;
235         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
236                 return;
237         __ipv6_confirm_neigh(dev, daddr);
238 }
239
240 static struct dst_ops ip6_dst_ops_template = {
241         .family                 =       AF_INET6,
242         .gc                     =       ip6_dst_gc,
243         .gc_thresh              =       1024,
244         .check                  =       ip6_dst_check,
245         .default_advmss         =       ip6_default_advmss,
246         .mtu                    =       ip6_mtu,
247         .cow_metrics            =       dst_cow_metrics_generic,
248         .destroy                =       ip6_dst_destroy,
249         .ifdown                 =       ip6_dst_ifdown,
250         .negative_advice        =       ip6_negative_advice,
251         .link_failure           =       ip6_link_failure,
252         .update_pmtu            =       ip6_rt_update_pmtu,
253         .redirect               =       rt6_do_redirect,
254         .local_out              =       __ip6_local_out,
255         .neigh_lookup           =       ip6_dst_neigh_lookup,
256         .confirm_neigh          =       ip6_confirm_neigh,
257 };
258
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
260 {
261         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
262
263         return mtu ? : dst->dev->mtu;
264 }
265
/* .update_pmtu callback of ip6_dst_blackhole_ops: intentionally a no-op. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

/* .redirect callback of ip6_dst_blackhole_ops: intentionally a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
275
276 static struct dst_ops ip6_dst_blackhole_ops = {
277         .family                 =       AF_INET6,
278         .destroy                =       ip6_dst_destroy,
279         .check                  =       ip6_dst_check,
280         .mtu                    =       ip6_blackhole_mtu,
281         .default_advmss         =       ip6_default_advmss,
282         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
283         .redirect               =       ip6_rt_blackhole_redirect,
284         .cow_metrics            =       dst_cow_metrics_generic,
285         .neigh_lookup           =       ip6_dst_neigh_lookup,
286 };
287
/*
 * Metrics template. Only the hop-limit slot is named explicitly (and set
 * to 0); every other element is zero-initialised implicitly.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
291
292 static const struct fib6_info fib6_null_entry_template = {
293         .fib6_flags     = (RTF_REJECT | RTF_NONEXTHOP),
294         .fib6_protocol  = RTPROT_KERNEL,
295         .fib6_metric    = ~(u32)0,
296         .fib6_ref       = ATOMIC_INIT(1),
297         .fib6_type      = RTN_UNREACHABLE,
298         .fib6_metrics   = (struct dst_metrics *)&dst_default_metrics,
299 };
300
301 static const struct rt6_info ip6_null_entry_template = {
302         .dst = {
303                 .__refcnt       = ATOMIC_INIT(1),
304                 .__use          = 1,
305                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
306                 .error          = -ENETUNREACH,
307                 .input          = ip6_pkt_discard,
308                 .output         = ip6_pkt_discard_out,
309         },
310         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
311 };
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
315 static const struct rt6_info ip6_prohibit_entry_template = {
316         .dst = {
317                 .__refcnt       = ATOMIC_INIT(1),
318                 .__use          = 1,
319                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
320                 .error          = -EACCES,
321                 .input          = ip6_pkt_prohibit,
322                 .output         = ip6_pkt_prohibit_out,
323         },
324         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
325 };
326
327 static const struct rt6_info ip6_blk_hole_entry_template = {
328         .dst = {
329                 .__refcnt       = ATOMIC_INIT(1),
330                 .__use          = 1,
331                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
332                 .error          = -EINVAL,
333                 .input          = dst_discard,
334                 .output         = dst_discard_out,
335         },
336         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
337 };
338
339 #endif
340
341 static void rt6_info_init(struct rt6_info *rt)
342 {
343         struct dst_entry *dst = &rt->dst;
344
345         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
346         INIT_LIST_HEAD(&rt->rt6i_uncached);
347 }
348
349 /* allocate dst with ip6_dst_ops */
350 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
351                                int flags)
352 {
353         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
354                                         1, DST_OBSOLETE_FORCE_CHK, flags);
355
356         if (rt) {
357                 rt6_info_init(rt);
358                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
359         }
360
361         return rt;
362 }
363 EXPORT_SYMBOL(ip6_dst_alloc);
364
365 static void ip6_dst_destroy(struct dst_entry *dst)
366 {
367         struct rt6_info *rt = (struct rt6_info *)dst;
368         struct fib6_info *from;
369         struct inet6_dev *idev;
370
371         dst_destroy_metrics_generic(dst);
372         rt6_uncached_list_del(rt);
373
374         idev = rt->rt6i_idev;
375         if (idev) {
376                 rt->rt6i_idev = NULL;
377                 in6_dev_put(idev);
378         }
379
380         rcu_read_lock();
381         from = rcu_dereference(rt->from);
382         rcu_assign_pointer(rt->from, NULL);
383         fib6_info_release(from);
384         rcu_read_unlock();
385 }
386
387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
388                            int how)
389 {
390         struct rt6_info *rt = (struct rt6_info *)dst;
391         struct inet6_dev *idev = rt->rt6i_idev;
392         struct net_device *loopback_dev =
393                 dev_net(dev)->loopback_dev;
394
395         if (idev && idev->dev != loopback_dev) {
396                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
397                 if (loopback_idev) {
398                         rt->rt6i_idev = loopback_idev;
399                         in6_dev_put(idev);
400                 }
401         }
402 }
403
404 static bool __rt6_check_expired(const struct rt6_info *rt)
405 {
406         if (rt->rt6i_flags & RTF_EXPIRES)
407                 return time_after(jiffies, rt->dst.expires);
408         else
409                 return false;
410 }
411
412 static bool rt6_check_expired(const struct rt6_info *rt)
413 {
414         struct fib6_info *from;
415
416         from = rcu_dereference(rt->from);
417
418         if (rt->rt6i_flags & RTF_EXPIRES) {
419                 if (time_after(jiffies, rt->dst.expires))
420                         return true;
421         } else if (from) {
422                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
423                         fib6_check_expired(from);
424         }
425         return false;
426 }
427
428 struct fib6_info *fib6_multipath_select(const struct net *net,
429                                         struct fib6_info *match,
430                                         struct flowi6 *fl6, int oif,
431                                         const struct sk_buff *skb,
432                                         int strict)
433 {
434         struct fib6_info *sibling, *next_sibling;
435
436         /* We might have already computed the hash for ICMPv6 errors. In such
437          * case it will always be non-zero. Otherwise now is the time to do it.
438          */
439         if (!fl6->mp_hash)
440                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
441
442         if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
443                 return match;
444
445         list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
446                                  fib6_siblings) {
447                 int nh_upper_bound;
448
449                 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
450                 if (fl6->mp_hash > nh_upper_bound)
451                         continue;
452                 if (rt6_score_route(sibling, oif, strict) < 0)
453                         break;
454                 match = sibling;
455                 break;
456         }
457
458         return match;
459 }
460
461 /*
462  *      Route lookup. rcu_read_lock() should be held.
463  */
464
465 static inline struct fib6_info *rt6_device_match(struct net *net,
466                                                  struct fib6_info *rt,
467                                                     const struct in6_addr *saddr,
468                                                     int oif,
469                                                     int flags)
470 {
471         struct fib6_info *sprt;
472
473         if (!oif && ipv6_addr_any(saddr) &&
474             !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
475                 return rt;
476
477         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
478                 const struct net_device *dev = sprt->fib6_nh.nh_dev;
479
480                 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
481                         continue;
482
483                 if (oif) {
484                         if (dev->ifindex == oif)
485                                 return sprt;
486                 } else {
487                         if (ipv6_chk_addr(net, saddr, dev,
488                                           flags & RT6_LOOKUP_F_IFACE))
489                                 return sprt;
490                 }
491         }
492
493         if (oif && flags & RT6_LOOKUP_F_IFACE)
494                 return net->ipv6.fib6_null_entry;
495
496         return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
497 }
498
499 #ifdef CONFIG_IPV6_ROUTER_PREF
500 struct __rt6_probe_work {
501         struct work_struct work;
502         struct in6_addr target;
503         struct net_device *dev;
504 };
505
506 static void rt6_probe_deferred(struct work_struct *w)
507 {
508         struct in6_addr mcaddr;
509         struct __rt6_probe_work *work =
510                 container_of(w, struct __rt6_probe_work, work);
511
512         addrconf_addr_solict_mult(&work->target, &mcaddr);
513         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
514         dev_put(work->dev);
515         kfree(work);
516 }
517
518 static void rt6_probe(struct fib6_info *rt)
519 {
520         struct __rt6_probe_work *work;
521         const struct in6_addr *nh_gw;
522         struct neighbour *neigh;
523         struct net_device *dev;
524
525         /*
526          * Okay, this does not seem to be appropriate
527          * for now, however, we need to check if it
528          * is really so; aka Router Reachability Probing.
529          *
530          * Router Reachability Probe MUST be rate-limited
531          * to no more than one per minute.
532          */
533         if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
534                 return;
535
536         nh_gw = &rt->fib6_nh.nh_gw;
537         dev = rt->fib6_nh.nh_dev;
538         rcu_read_lock_bh();
539         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
540         if (neigh) {
541                 struct inet6_dev *idev;
542
543                 if (neigh->nud_state & NUD_VALID)
544                         goto out;
545
546                 idev = __in6_dev_get(dev);
547                 work = NULL;
548                 write_lock(&neigh->lock);
549                 if (!(neigh->nud_state & NUD_VALID) &&
550                     time_after(jiffies,
551                                neigh->updated + idev->cnf.rtr_probe_interval)) {
552                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
553                         if (work)
554                                 __neigh_set_probe_once(neigh);
555                 }
556                 write_unlock(&neigh->lock);
557         } else {
558                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
559         }
560
561         if (work) {
562                 INIT_WORK(&work->work, rt6_probe_deferred);
563                 work->target = *nh_gw;
564                 dev_hold(dev);
565                 work->dev = dev;
566                 schedule_work(&work->work);
567         }
568
569 out:
570         rcu_read_unlock_bh();
571 }
572 #else
/* Without CONFIG_IPV6_ROUTER_PREF router probing is compiled out. */
static inline void rt6_probe(struct fib6_info *rt)
{
}
576 #endif
577
578 /*
579  * Default Router Selection (RFC 2461 6.3.6)
580  */
581 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
582 {
583         const struct net_device *dev = rt->fib6_nh.nh_dev;
584
585         if (!oif || dev->ifindex == oif)
586                 return 2;
587         return 0;
588 }
589
590 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
591 {
592         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
593         struct neighbour *neigh;
594
595         if (rt->fib6_flags & RTF_NONEXTHOP ||
596             !(rt->fib6_flags & RTF_GATEWAY))
597                 return RT6_NUD_SUCCEED;
598
599         rcu_read_lock_bh();
600         neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
601                                           &rt->fib6_nh.nh_gw);
602         if (neigh) {
603                 read_lock(&neigh->lock);
604                 if (neigh->nud_state & NUD_VALID)
605                         ret = RT6_NUD_SUCCEED;
606 #ifdef CONFIG_IPV6_ROUTER_PREF
607                 else if (!(neigh->nud_state & NUD_FAILED))
608                         ret = RT6_NUD_SUCCEED;
609                 else
610                         ret = RT6_NUD_FAIL_PROBE;
611 #endif
612                 read_unlock(&neigh->lock);
613         } else {
614                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
615                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
616         }
617         rcu_read_unlock_bh();
618
619         return ret;
620 }
621
622 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
623 {
624         int m;
625
626         m = rt6_check_dev(rt, oif);
627         if (!m && (strict & RT6_LOOKUP_F_IFACE))
628                 return RT6_NUD_FAIL_HARD;
629 #ifdef CONFIG_IPV6_ROUTER_PREF
630         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
631 #endif
632         if (strict & RT6_LOOKUP_F_REACHABLE) {
633                 int n = rt6_check_neigh(rt);
634                 if (n < 0)
635                         return n;
636         }
637         return m;
638 }
639
640 /* called with rc_read_lock held */
641 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
642 {
643         const struct net_device *dev = fib6_info_nh_dev(f6i);
644         bool rc = false;
645
646         if (dev) {
647                 const struct inet6_dev *idev = __in6_dev_get(dev);
648
649                 rc = !!idev->cnf.ignore_routes_with_linkdown;
650         }
651
652         return rc;
653 }
654
655 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
656                                    int *mpri, struct fib6_info *match,
657                                    bool *do_rr)
658 {
659         int m;
660         bool match_do_rr = false;
661
662         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
663                 goto out;
664
665         if (fib6_ignore_linkdown(rt) &&
666             rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
667             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
668                 goto out;
669
670         if (fib6_check_expired(rt))
671                 goto out;
672
673         m = rt6_score_route(rt, oif, strict);
674         if (m == RT6_NUD_FAIL_DO_RR) {
675                 match_do_rr = true;
676                 m = 0; /* lowest valid score */
677         } else if (m == RT6_NUD_FAIL_HARD) {
678                 goto out;
679         }
680
681         if (strict & RT6_LOOKUP_F_REACHABLE)
682                 rt6_probe(rt);
683
684         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
685         if (m > *mpri) {
686                 *do_rr = match_do_rr;
687                 *mpri = m;
688                 match = rt;
689         }
690 out:
691         return match;
692 }
693
694 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
695                                      struct fib6_info *leaf,
696                                      struct fib6_info *rr_head,
697                                      u32 metric, int oif, int strict,
698                                      bool *do_rr)
699 {
700         struct fib6_info *rt, *match, *cont;
701         int mpri = -1;
702
703         match = NULL;
704         cont = NULL;
705         for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
706                 if (rt->fib6_metric != metric) {
707                         cont = rt;
708                         break;
709                 }
710
711                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
712         }
713
714         for (rt = leaf; rt && rt != rr_head;
715              rt = rcu_dereference(rt->fib6_next)) {
716                 if (rt->fib6_metric != metric) {
717                         cont = rt;
718                         break;
719                 }
720
721                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
722         }
723
724         if (match || !cont)
725                 return match;
726
727         for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
728                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
729
730         return match;
731 }
732
733 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
734                                    int oif, int strict)
735 {
736         struct fib6_info *leaf = rcu_dereference(fn->leaf);
737         struct fib6_info *match, *rt0;
738         bool do_rr = false;
739         int key_plen;
740
741         if (!leaf || leaf == net->ipv6.fib6_null_entry)
742                 return net->ipv6.fib6_null_entry;
743
744         rt0 = rcu_dereference(fn->rr_ptr);
745         if (!rt0)
746                 rt0 = leaf;
747
748         /* Double check to make sure fn is not an intermediate node
749          * and fn->leaf does not points to its child's leaf
750          * (This might happen if all routes under fn are deleted from
751          * the tree and fib6_repair_tree() is called on the node.)
752          */
753         key_plen = rt0->fib6_dst.plen;
754 #ifdef CONFIG_IPV6_SUBTREES
755         if (rt0->fib6_src.plen)
756                 key_plen = rt0->fib6_src.plen;
757 #endif
758         if (fn->fn_bit != key_plen)
759                 return net->ipv6.fib6_null_entry;
760
761         match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
762                              &do_rr);
763
764         if (do_rr) {
765                 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
766
767                 /* no entries matched; do round-robin */
768                 if (!next || next->fib6_metric != rt0->fib6_metric)
769                         next = leaf;
770
771                 if (next != rt0) {
772                         spin_lock_bh(&leaf->fib6_table->tb6_lock);
773                         /* make sure next is not being deleted from the tree */
774                         if (next->fib6_node)
775                                 rcu_assign_pointer(fn->rr_ptr, next);
776                         spin_unlock_bh(&leaf->fib6_table->tb6_lock);
777                 }
778         }
779
780         return match ? match : net->ipv6.fib6_null_entry;
781 }
782
783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
784 {
785         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
786 }
787
788 #ifdef CONFIG_IPV6_ROUTE_INFO
789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
790                   const struct in6_addr *gwaddr)
791 {
792         struct net *net = dev_net(dev);
793         struct route_info *rinfo = (struct route_info *) opt;
794         struct in6_addr prefix_buf, *prefix;
795         unsigned int pref;
796         unsigned long lifetime;
797         struct fib6_info *rt;
798
799         if (len < sizeof(struct route_info)) {
800                 return -EINVAL;
801         }
802
803         /* Sanity check for prefix_len and length */
804         if (rinfo->length > 3) {
805                 return -EINVAL;
806         } else if (rinfo->prefix_len > 128) {
807                 return -EINVAL;
808         } else if (rinfo->prefix_len > 64) {
809                 if (rinfo->length < 2) {
810                         return -EINVAL;
811                 }
812         } else if (rinfo->prefix_len > 0) {
813                 if (rinfo->length < 1) {
814                         return -EINVAL;
815                 }
816         }
817
818         pref = rinfo->route_pref;
819         if (pref == ICMPV6_ROUTER_PREF_INVALID)
820                 return -EINVAL;
821
822         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
823
824         if (rinfo->length == 3)
825                 prefix = (struct in6_addr *)rinfo->prefix;
826         else {
827                 /* this function is safe */
828                 ipv6_addr_prefix(&prefix_buf,
829                                  (struct in6_addr *)rinfo->prefix,
830                                  rinfo->prefix_len);
831                 prefix = &prefix_buf;
832         }
833
834         if (rinfo->prefix_len == 0)
835                 rt = rt6_get_dflt_router(net, gwaddr, dev);
836         else
837                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
838                                         gwaddr, dev);
839
840         if (rt && !lifetime) {
841                 ip6_del_rt(net, rt);
842                 rt = NULL;
843         }
844
845         if (!rt && lifetime)
846                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
847                                         dev, pref);
848         else if (rt)
849                 rt->fib6_flags = RTF_ROUTEINFO |
850                                  (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
851
852         if (rt) {
853                 if (!addrconf_finite_timeout(lifetime))
854                         fib6_clean_expires(rt);
855                 else
856                         fib6_set_expires(rt, jiffies + HZ * lifetime);
857
858                 fib6_info_release(rt);
859         }
860         return 0;
861 }
862 #endif
863
864 /*
865  *      Misc support functions
866  */
867
868 /* called with rcu_lock held */
869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
870 {
871         struct net_device *dev = rt->fib6_nh.nh_dev;
872
873         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
874                 /* for copies of local routes, dst->dev needs to be the
875                  * device if it is a master device, the master device if
876                  * device is enslaved, and the loopback as the default
877                  */
878                 if (netif_is_l3_slave(dev) &&
879                     !rt6_need_strict(&rt->fib6_dst.addr))
880                         dev = l3mdev_master_dev_rcu(dev);
881                 else if (!netif_is_l3_master(dev))
882                         dev = dev_net(dev)->loopback_dev;
883                 /* last case is netif_is_l3_master(dev) is true in which
884                  * case we want dev returned to be dev
885                  */
886         }
887
888         return dev;
889 }
890
891 static const int fib6_prop[RTN_MAX + 1] = {
892         [RTN_UNSPEC]    = 0,
893         [RTN_UNICAST]   = 0,
894         [RTN_LOCAL]     = 0,
895         [RTN_BROADCAST] = 0,
896         [RTN_ANYCAST]   = 0,
897         [RTN_MULTICAST] = 0,
898         [RTN_BLACKHOLE] = -EINVAL,
899         [RTN_UNREACHABLE] = -EHOSTUNREACH,
900         [RTN_PROHIBIT]  = -EACCES,
901         [RTN_THROW]     = -EAGAIN,
902         [RTN_NAT]       = -EINVAL,
903         [RTN_XRESOLVE]  = -EINVAL,
904 };
905
906 static int ip6_rt_type_to_error(u8 fib6_type)
907 {
908         return fib6_prop[fib6_type];
909 }
910
911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
912 {
913         unsigned short flags = 0;
914
915         if (rt->dst_nocount)
916                 flags |= DST_NOCOUNT;
917         if (rt->dst_nopolicy)
918                 flags |= DST_NOPOLICY;
919         if (rt->dst_host)
920                 flags |= DST_HOST;
921
922         return flags;
923 }
924
925 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
926 {
927         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
928
929         switch (ort->fib6_type) {
930         case RTN_BLACKHOLE:
931                 rt->dst.output = dst_discard_out;
932                 rt->dst.input = dst_discard;
933                 break;
934         case RTN_PROHIBIT:
935                 rt->dst.output = ip6_pkt_prohibit_out;
936                 rt->dst.input = ip6_pkt_prohibit;
937                 break;
938         case RTN_THROW:
939         case RTN_UNREACHABLE:
940         default:
941                 rt->dst.output = ip6_pkt_discard_out;
942                 rt->dst.input = ip6_pkt_discard;
943                 break;
944         }
945 }
946
947 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
948 {
949         rt->dst.flags |= fib6_info_dst_flags(ort);
950
951         if (ort->fib6_flags & RTF_REJECT) {
952                 ip6_rt_init_dst_reject(rt, ort);
953                 return;
954         }
955
956         rt->dst.error = 0;
957         rt->dst.output = ip6_output;
958
959         if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
960                 rt->dst.input = ip6_input;
961         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
962                 rt->dst.input = ip6_mc_input;
963         } else {
964                 rt->dst.input = ip6_forward;
965         }
966
967         if (ort->fib6_nh.nh_lwtstate) {
968                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
969                 lwtunnel_set_redirect(&rt->dst);
970         }
971
972         rt->dst.lastuse = jiffies;
973 }
974
975 /* Caller must already hold reference to @from */
976 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
977 {
978         rt->rt6i_flags &= ~RTF_EXPIRES;
979         rcu_assign_pointer(rt->from, from);
980         dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
981 }
982
983 /* Caller must already hold reference to @ort */
984 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
985 {
986         struct net_device *dev = fib6_info_nh_dev(ort);
987
988         ip6_rt_init_dst(rt, ort);
989
990         rt->rt6i_dst = ort->fib6_dst;
991         rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
992         rt->rt6i_gateway = ort->fib6_nh.nh_gw;
993         rt->rt6i_flags = ort->fib6_flags;
994         rt6_set_from(rt, ort);
995 #ifdef CONFIG_IPV6_SUBTREES
996         rt->rt6i_src = ort->fib6_src;
997 #endif
998         rt->rt6i_prefsrc = ort->fib6_prefsrc;
999 }
1000
1001 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1002                                         struct in6_addr *saddr)
1003 {
1004         struct fib6_node *pn, *sn;
1005         while (1) {
1006                 if (fn->fn_flags & RTN_TL_ROOT)
1007                         return NULL;
1008                 pn = rcu_dereference(fn->parent);
1009                 sn = FIB6_SUBTREE(pn);
1010                 if (sn && sn != fn)
1011                         fn = fib6_node_lookup(sn, NULL, saddr);
1012                 else
1013                         fn = pn;
1014                 if (fn->fn_flags & RTN_RTINFO)
1015                         return fn;
1016         }
1017 }
1018
1019 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1020                           bool null_fallback)
1021 {
1022         struct rt6_info *rt = *prt;
1023
1024         if (dst_hold_safe(&rt->dst))
1025                 return true;
1026         if (null_fallback) {
1027                 rt = net->ipv6.ip6_null_entry;
1028                 dst_hold(&rt->dst);
1029         } else {
1030                 rt = NULL;
1031         }
1032         *prt = rt;
1033         return false;
1034 }
1035
1036 /* called with rcu_lock held */
1037 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1038 {
1039         unsigned short flags = fib6_info_dst_flags(rt);
1040         struct net_device *dev = rt->fib6_nh.nh_dev;
1041         struct rt6_info *nrt;
1042
1043         if (!fib6_info_hold_safe(rt))
1044                 return NULL;
1045
1046         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1047         if (nrt)
1048                 ip6_rt_copy_init(nrt, rt);
1049         else
1050                 fib6_info_release(rt);
1051
1052         return nrt;
1053 }
1054
1055 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1056                                              struct fib6_table *table,
1057                                              struct flowi6 *fl6,
1058                                              const struct sk_buff *skb,
1059                                              int flags)
1060 {
1061         struct fib6_info *f6i;
1062         struct fib6_node *fn;
1063         struct rt6_info *rt;
1064
1065         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1066                 flags &= ~RT6_LOOKUP_F_IFACE;
1067
1068         rcu_read_lock();
1069         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1070 restart:
1071         f6i = rcu_dereference(fn->leaf);
1072         if (!f6i) {
1073                 f6i = net->ipv6.fib6_null_entry;
1074         } else {
1075                 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1076                                       fl6->flowi6_oif, flags);
1077                 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1078                         f6i = fib6_multipath_select(net, f6i, fl6,
1079                                                     fl6->flowi6_oif, skb,
1080                                                     flags);
1081         }
1082         if (f6i == net->ipv6.fib6_null_entry) {
1083                 fn = fib6_backtrack(fn, &fl6->saddr);
1084                 if (fn)
1085                         goto restart;
1086         }
1087
1088         trace_fib6_table_lookup(net, f6i, table, fl6);
1089
1090         /* Search through exception table */
1091         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1092         if (rt) {
1093                 if (ip6_hold_safe(net, &rt, true))
1094                         dst_use_noref(&rt->dst, jiffies);
1095         } else if (f6i == net->ipv6.fib6_null_entry) {
1096                 rt = net->ipv6.ip6_null_entry;
1097                 dst_hold(&rt->dst);
1098         } else {
1099                 rt = ip6_create_rt_rcu(f6i);
1100                 if (!rt) {
1101                         rt = net->ipv6.ip6_null_entry;
1102                         dst_hold(&rt->dst);
1103                 }
1104         }
1105
1106         rcu_read_unlock();
1107
1108         return rt;
1109 }
1110
1111 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1112                                    const struct sk_buff *skb, int flags)
1113 {
1114         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1115 }
1116 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1117
1118 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1119                             const struct in6_addr *saddr, int oif,
1120                             const struct sk_buff *skb, int strict)
1121 {
1122         struct flowi6 fl6 = {
1123                 .flowi6_oif = oif,
1124                 .daddr = *daddr,
1125         };
1126         struct dst_entry *dst;
1127         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1128
1129         if (saddr) {
1130                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1131                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1132         }
1133
1134         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1135         if (dst->error == 0)
1136                 return (struct rt6_info *) dst;
1137
1138         dst_release(dst);
1139
1140         return NULL;
1141 }
1142 EXPORT_SYMBOL(rt6_lookup);
1143
1144 /* ip6_ins_rt is called with FREE table->tb6_lock.
1145  * It takes new route entry, the addition fails by any reason the
1146  * route is released.
1147  * Caller must hold dst before calling it.
1148  */
1149
1150 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1151                         struct netlink_ext_ack *extack)
1152 {
1153         int err;
1154         struct fib6_table *table;
1155
1156         table = rt->fib6_table;
1157         spin_lock_bh(&table->tb6_lock);
1158         err = fib6_add(&table->tb6_root, rt, info, extack);
1159         spin_unlock_bh(&table->tb6_lock);
1160
1161         return err;
1162 }
1163
1164 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1165 {
1166         struct nl_info info = { .nl_net = net, };
1167
1168         return __ip6_ins_rt(rt, &info, NULL);
1169 }
1170
1171 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1172                                            const struct in6_addr *daddr,
1173                                            const struct in6_addr *saddr)
1174 {
1175         struct net_device *dev;
1176         struct rt6_info *rt;
1177
1178         /*
1179          *      Clone the route.
1180          */
1181
1182         if (!fib6_info_hold_safe(ort))
1183                 return NULL;
1184
1185         dev = ip6_rt_get_dev_rcu(ort);
1186         rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1187         if (!rt) {
1188                 fib6_info_release(ort);
1189                 return NULL;
1190         }
1191
1192         ip6_rt_copy_init(rt, ort);
1193         rt->rt6i_flags |= RTF_CACHE;
1194         rt->dst.flags |= DST_HOST;
1195         rt->rt6i_dst.addr = *daddr;
1196         rt->rt6i_dst.plen = 128;
1197
1198         if (!rt6_is_gw_or_nonexthop(ort)) {
1199                 if (ort->fib6_dst.plen != 128 &&
1200                     ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1201                         rt->rt6i_flags |= RTF_ANYCAST;
1202 #ifdef CONFIG_IPV6_SUBTREES
1203                 if (rt->rt6i_src.plen && saddr) {
1204                         rt->rt6i_src.addr = *saddr;
1205                         rt->rt6i_src.plen = 128;
1206                 }
1207 #endif
1208         }
1209
1210         return rt;
1211 }
1212
1213 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1214 {
1215         unsigned short flags = fib6_info_dst_flags(rt);
1216         struct net_device *dev;
1217         struct rt6_info *pcpu_rt;
1218
1219         if (!fib6_info_hold_safe(rt))
1220                 return NULL;
1221
1222         rcu_read_lock();
1223         dev = ip6_rt_get_dev_rcu(rt);
1224         pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1225         rcu_read_unlock();
1226         if (!pcpu_rt) {
1227                 fib6_info_release(rt);
1228                 return NULL;
1229         }
1230         ip6_rt_copy_init(pcpu_rt, rt);
1231         pcpu_rt->rt6i_flags |= RTF_PCPU;
1232         return pcpu_rt;
1233 }
1234
1235 /* It should be called with rcu_read_lock() acquired */
1236 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1237 {
1238         struct rt6_info *pcpu_rt, **p;
1239
1240         p = this_cpu_ptr(rt->rt6i_pcpu);
1241         pcpu_rt = *p;
1242
1243         if (pcpu_rt)
1244                 ip6_hold_safe(NULL, &pcpu_rt, false);
1245
1246         return pcpu_rt;
1247 }
1248
1249 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1250                                             struct fib6_info *rt)
1251 {
1252         struct rt6_info *pcpu_rt, *prev, **p;
1253
1254         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1255         if (!pcpu_rt) {
1256                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1257                 return net->ipv6.ip6_null_entry;
1258         }
1259
1260         dst_hold(&pcpu_rt->dst);
1261         p = this_cpu_ptr(rt->rt6i_pcpu);
1262         prev = cmpxchg(p, NULL, pcpu_rt);
1263         BUG_ON(prev);
1264
1265         return pcpu_rt;
1266 }
1267
/* Exception hash table implementation.
 *
 * rt6_exception_lock is the spinlock taken (with spin_lock_bh()) by the
 * functions in this file that insert into, remove from or flush the
 * exception table of a route.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1271
1272 /* Remove rt6_ex from hash table and free the memory
1273  * Caller must hold rt6_exception_lock
1274  */
1275 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1276                                  struct rt6_exception *rt6_ex)
1277 {
1278         struct net *net;
1279
1280         if (!bucket || !rt6_ex)
1281                 return;
1282
1283         net = dev_net(rt6_ex->rt6i->dst.dev);
1284         hlist_del_rcu(&rt6_ex->hlist);
1285         dst_release(&rt6_ex->rt6i->dst);
1286         kfree_rcu(rt6_ex, rcu);
1287         WARN_ON_ONCE(!bucket->depth);
1288         bucket->depth--;
1289         net->ipv6.rt6_stats->fib_rt_cache--;
1290 }
1291
1292 /* Remove oldest rt6_ex in bucket and free the memory
1293  * Caller must hold rt6_exception_lock
1294  */
1295 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1296 {
1297         struct rt6_exception *rt6_ex, *oldest = NULL;
1298
1299         if (!bucket)
1300                 return;
1301
1302         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1303                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1304                         oldest = rt6_ex;
1305         }
1306         rt6_remove_exception(bucket, oldest);
1307 }
1308
1309 static u32 rt6_exception_hash(const struct in6_addr *dst,
1310                               const struct in6_addr *src)
1311 {
1312         static u32 seed __read_mostly;
1313         u32 val;
1314
1315         net_get_random_once(&seed, sizeof(seed));
1316         val = jhash(dst, sizeof(*dst), seed);
1317
1318 #ifdef CONFIG_IPV6_SUBTREES
1319         if (src)
1320                 val = jhash(src, sizeof(*src), val);
1321 #endif
1322         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1323 }
1324
1325 /* Helper function to find the cached rt in the hash table
1326  * and update bucket pointer to point to the bucket for this
1327  * (daddr, saddr) pair
1328  * Caller must hold rt6_exception_lock
1329  */
1330 static struct rt6_exception *
1331 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1332                               const struct in6_addr *daddr,
1333                               const struct in6_addr *saddr)
1334 {
1335         struct rt6_exception *rt6_ex;
1336         u32 hval;
1337
1338         if (!(*bucket) || !daddr)
1339                 return NULL;
1340
1341         hval = rt6_exception_hash(daddr, saddr);
1342         *bucket += hval;
1343
1344         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1345                 struct rt6_info *rt6 = rt6_ex->rt6i;
1346                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1347
1348 #ifdef CONFIG_IPV6_SUBTREES
1349                 if (matched && saddr)
1350                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1351 #endif
1352                 if (matched)
1353                         return rt6_ex;
1354         }
1355         return NULL;
1356 }
1357
1358 /* Helper function to find the cached rt in the hash table
1359  * and update bucket pointer to point to the bucket for this
1360  * (daddr, saddr) pair
1361  * Caller must hold rcu_read_lock()
1362  */
1363 static struct rt6_exception *
1364 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1365                          const struct in6_addr *daddr,
1366                          const struct in6_addr *saddr)
1367 {
1368         struct rt6_exception *rt6_ex;
1369         u32 hval;
1370
1371         WARN_ON_ONCE(!rcu_read_lock_held());
1372
1373         if (!(*bucket) || !daddr)
1374                 return NULL;
1375
1376         hval = rt6_exception_hash(daddr, saddr);
1377         *bucket += hval;
1378
1379         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1380                 struct rt6_info *rt6 = rt6_ex->rt6i;
1381                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1382
1383 #ifdef CONFIG_IPV6_SUBTREES
1384                 if (matched && saddr)
1385                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1386 #endif
1387                 if (matched)
1388                         return rt6_ex;
1389         }
1390         return NULL;
1391 }
1392
1393 static unsigned int fib6_mtu(const struct fib6_info *rt)
1394 {
1395         unsigned int mtu;
1396
1397         if (rt->fib6_pmtu) {
1398                 mtu = rt->fib6_pmtu;
1399         } else {
1400                 struct net_device *dev = fib6_info_nh_dev(rt);
1401                 struct inet6_dev *idev;
1402
1403                 rcu_read_lock();
1404                 idev = __in6_dev_get(dev);
1405                 mtu = idev->cnf.mtu6;
1406                 rcu_read_unlock();
1407         }
1408
1409         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1410
1411         return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1412 }
1413
1414 static int rt6_insert_exception(struct rt6_info *nrt,
1415                                 struct fib6_info *ort)
1416 {
1417         struct net *net = dev_net(nrt->dst.dev);
1418         struct rt6_exception_bucket *bucket;
1419         struct in6_addr *src_key = NULL;
1420         struct rt6_exception *rt6_ex;
1421         int err = 0;
1422
1423         spin_lock_bh(&rt6_exception_lock);
1424
1425         if (ort->exception_bucket_flushed) {
1426                 err = -EINVAL;
1427                 goto out;
1428         }
1429
1430         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1431                                         lockdep_is_held(&rt6_exception_lock));
1432         if (!bucket) {
1433                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1434                                  GFP_ATOMIC);
1435                 if (!bucket) {
1436                         err = -ENOMEM;
1437                         goto out;
1438                 }
1439                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1440         }
1441
1442 #ifdef CONFIG_IPV6_SUBTREES
1443         /* rt6i_src.plen != 0 indicates ort is in subtree
1444          * and exception table is indexed by a hash of
1445          * both rt6i_dst and rt6i_src.
1446          * Otherwise, the exception table is indexed by
1447          * a hash of only rt6i_dst.
1448          */
1449         if (ort->fib6_src.plen)
1450                 src_key = &nrt->rt6i_src.addr;
1451 #endif
1452
1453         /* Update rt6i_prefsrc as it could be changed
1454          * in rt6_remove_prefsrc()
1455          */
1456         nrt->rt6i_prefsrc = ort->fib6_prefsrc;
1457         /* rt6_mtu_change() might lower mtu on ort.
1458          * Only insert this exception route if its mtu
1459          * is less than ort's mtu value.
1460          */
1461         if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1462                 err = -EINVAL;
1463                 goto out;
1464         }
1465
1466         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1467                                                src_key);
1468         if (rt6_ex)
1469                 rt6_remove_exception(bucket, rt6_ex);
1470
1471         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1472         if (!rt6_ex) {
1473                 err = -ENOMEM;
1474                 goto out;
1475         }
1476         rt6_ex->rt6i = nrt;
1477         rt6_ex->stamp = jiffies;
1478         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1479         bucket->depth++;
1480         net->ipv6.rt6_stats->fib_rt_cache++;
1481
1482         if (bucket->depth > FIB6_MAX_DEPTH)
1483                 rt6_exception_remove_oldest(bucket);
1484
1485 out:
1486         spin_unlock_bh(&rt6_exception_lock);
1487
1488         /* Update fn->fn_sernum to invalidate all cached dst */
1489         if (!err) {
1490                 spin_lock_bh(&ort->fib6_table->tb6_lock);
1491                 fib6_update_sernum(net, ort);
1492                 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1493                 fib6_force_start_gc(net);
1494         }
1495
1496         return err;
1497 }
1498
1499 void rt6_flush_exceptions(struct fib6_info *rt)
1500 {
1501         struct rt6_exception_bucket *bucket;
1502         struct rt6_exception *rt6_ex;
1503         struct hlist_node *tmp;
1504         int i;
1505
1506         spin_lock_bh(&rt6_exception_lock);
1507         /* Prevent rt6_insert_exception() to recreate the bucket list */
1508         rt->exception_bucket_flushed = 1;
1509
1510         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1511                                     lockdep_is_held(&rt6_exception_lock));
1512         if (!bucket)
1513                 goto out;
1514
1515         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1516                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1517                         rt6_remove_exception(bucket, rt6_ex);
1518                 WARN_ON_ONCE(bucket->depth);
1519                 bucket++;
1520         }
1521
1522 out:
1523         spin_unlock_bh(&rt6_exception_lock);
1524 }
1525
1526 /* Find cached rt in the hash table inside passed in rt
1527  * Caller has to hold rcu_read_lock()
1528  */
1529 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1530                                            struct in6_addr *daddr,
1531                                            struct in6_addr *saddr)
1532 {
1533         struct rt6_exception_bucket *bucket;
1534         struct in6_addr *src_key = NULL;
1535         struct rt6_exception *rt6_ex;
1536         struct rt6_info *res = NULL;
1537
1538         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1539
1540 #ifdef CONFIG_IPV6_SUBTREES
1541         /* rt6i_src.plen != 0 indicates rt is in subtree
1542          * and exception table is indexed by a hash of
1543          * both rt6i_dst and rt6i_src.
1544          * Otherwise, the exception table is indexed by
1545          * a hash of only rt6i_dst.
1546          */
1547         if (rt->fib6_src.plen)
1548                 src_key = saddr;
1549 #endif
1550         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1551
1552         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1553                 res = rt6_ex->rt6i;
1554
1555         return res;
1556 }
1557
1558 /* Remove the passed in cached rt from the hash table that contains it */
1559 static int rt6_remove_exception_rt(struct rt6_info *rt)
1560 {
1561         struct rt6_exception_bucket *bucket;
1562         struct in6_addr *src_key = NULL;
1563         struct rt6_exception *rt6_ex;
1564         struct fib6_info *from;
1565         int err;
1566
1567         from = rcu_dereference(rt->from);
1568         if (!from ||
1569             !(rt->rt6i_flags & RTF_CACHE))
1570                 return -EINVAL;
1571
1572         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1573                 return -ENOENT;
1574
1575         spin_lock_bh(&rt6_exception_lock);
1576         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1577                                     lockdep_is_held(&rt6_exception_lock));
1578 #ifdef CONFIG_IPV6_SUBTREES
1579         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1580          * and exception table is indexed by a hash of
1581          * both rt6i_dst and rt6i_src.
1582          * Otherwise, the exception table is indexed by
1583          * a hash of only rt6i_dst.
1584          */
1585         if (from->fib6_src.plen)
1586                 src_key = &rt->rt6i_src.addr;
1587 #endif
1588         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1589                                                &rt->rt6i_dst.addr,
1590                                                src_key);
1591         if (rt6_ex) {
1592                 rt6_remove_exception(bucket, rt6_ex);
1593                 err = 0;
1594         } else {
1595                 err = -ENOENT;
1596         }
1597
1598         spin_unlock_bh(&rt6_exception_lock);
1599         return err;
1600 }
1601
1602 /* Find rt6_ex which contains the passed in rt cache and
1603  * refresh its stamp
1604  */
1605 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1606 {
1607         struct rt6_exception_bucket *bucket;
1608         struct fib6_info *from = rt->from;
1609         struct in6_addr *src_key = NULL;
1610         struct rt6_exception *rt6_ex;
1611
1612         if (!from ||
1613             !(rt->rt6i_flags & RTF_CACHE))
1614                 return;
1615
1616         rcu_read_lock();
1617         bucket = rcu_dereference(from->rt6i_exception_bucket);
1618
1619 #ifdef CONFIG_IPV6_SUBTREES
1620         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1621          * and exception table is indexed by a hash of
1622          * both rt6i_dst and rt6i_src.
1623          * Otherwise, the exception table is indexed by
1624          * a hash of only rt6i_dst.
1625          */
1626         if (from->fib6_src.plen)
1627                 src_key = &rt->rt6i_src.addr;
1628 #endif
1629         rt6_ex = __rt6_find_exception_rcu(&bucket,
1630                                           &rt->rt6i_dst.addr,
1631                                           src_key);
1632         if (rt6_ex)
1633                 rt6_ex->stamp = jiffies;
1634
1635         rcu_read_unlock();
1636 }
1637
1638 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1639 {
1640         struct rt6_exception_bucket *bucket;
1641         struct rt6_exception *rt6_ex;
1642         int i;
1643
1644         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1645                                         lockdep_is_held(&rt6_exception_lock));
1646
1647         if (bucket) {
1648                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1649                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1650                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1651                         }
1652                         bucket++;
1653                 }
1654         }
1655 }
1656
1657 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1658                                          struct rt6_info *rt, int mtu)
1659 {
1660         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1661          * lowest MTU in the path: always allow updating the route PMTU to
1662          * reflect PMTU decreases.
1663          *
1664          * If the new MTU is higher, and the route PMTU is equal to the local
1665          * MTU, this means the old MTU is the lowest in the path, so allow
1666          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1667          * handle this.
1668          */
1669
1670         if (dst_mtu(&rt->dst) >= mtu)
1671                 return true;
1672
1673         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1674                 return true;
1675
1676         return false;
1677 }
1678
1679 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1680                                        struct fib6_info *rt, int mtu)
1681 {
1682         struct rt6_exception_bucket *bucket;
1683         struct rt6_exception *rt6_ex;
1684         int i;
1685
1686         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1687                                         lockdep_is_held(&rt6_exception_lock));
1688
1689         if (!bucket)
1690                 return;
1691
1692         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1693                 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1694                         struct rt6_info *entry = rt6_ex->rt6i;
1695
1696                         /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1697                          * route), the metrics of its rt->from have already
1698                          * been updated.
1699                          */
1700                         if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1701                             rt6_mtu_change_route_allowed(idev, entry, mtu))
1702                                 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1703                 }
1704                 bucket++;
1705         }
1706 }
1707
1708 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1709
1710 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1711                                         struct in6_addr *gateway)
1712 {
1713         struct rt6_exception_bucket *bucket;
1714         struct rt6_exception *rt6_ex;
1715         struct hlist_node *tmp;
1716         int i;
1717
1718         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1719                 return;
1720
1721         spin_lock_bh(&rt6_exception_lock);
1722         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1723                                      lockdep_is_held(&rt6_exception_lock));
1724
1725         if (bucket) {
1726                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1727                         hlist_for_each_entry_safe(rt6_ex, tmp,
1728                                                   &bucket->chain, hlist) {
1729                                 struct rt6_info *entry = rt6_ex->rt6i;
1730
1731                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1732                                     RTF_CACHE_GATEWAY &&
1733                                     ipv6_addr_equal(gateway,
1734                                                     &entry->rt6i_gateway)) {
1735                                         rt6_remove_exception(bucket, rt6_ex);
1736                                 }
1737                         }
1738                         bucket++;
1739                 }
1740         }
1741
1742         spin_unlock_bh(&rt6_exception_lock);
1743 }
1744
1745 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1746                                       struct rt6_exception *rt6_ex,
1747                                       struct fib6_gc_args *gc_args,
1748                                       unsigned long now)
1749 {
1750         struct rt6_info *rt = rt6_ex->rt6i;
1751
1752         /* we are pruning and obsoleting aged-out and non gateway exceptions
1753          * even if others have still references to them, so that on next
1754          * dst_check() such references can be dropped.
1755          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1756          * expired, independently from their aging, as per RFC 8201 section 4
1757          */
1758         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1759                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1760                         RT6_TRACE("aging clone %p\n", rt);
1761                         rt6_remove_exception(bucket, rt6_ex);
1762                         return;
1763                 }
1764         } else if (time_after(jiffies, rt->dst.expires)) {
1765                 RT6_TRACE("purging expired route %p\n", rt);
1766                 rt6_remove_exception(bucket, rt6_ex);
1767                 return;
1768         }
1769
1770         if (rt->rt6i_flags & RTF_GATEWAY) {
1771                 struct neighbour *neigh;
1772                 __u8 neigh_flags = 0;
1773
1774                 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1775                 if (neigh)
1776                         neigh_flags = neigh->flags;
1777
1778                 if (!(neigh_flags & NTF_ROUTER)) {
1779                         RT6_TRACE("purging route %p via non-router but gateway\n",
1780                                   rt);
1781                         rt6_remove_exception(bucket, rt6_ex);
1782                         return;
1783                 }
1784         }
1785
1786         gc_args->more++;
1787 }
1788
1789 void rt6_age_exceptions(struct fib6_info *rt,
1790                         struct fib6_gc_args *gc_args,
1791                         unsigned long now)
1792 {
1793         struct rt6_exception_bucket *bucket;
1794         struct rt6_exception *rt6_ex;
1795         struct hlist_node *tmp;
1796         int i;
1797
1798         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1799                 return;
1800
1801         rcu_read_lock_bh();
1802         spin_lock(&rt6_exception_lock);
1803         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1804                                     lockdep_is_held(&rt6_exception_lock));
1805
1806         if (bucket) {
1807                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1808                         hlist_for_each_entry_safe(rt6_ex, tmp,
1809                                                   &bucket->chain, hlist) {
1810                                 rt6_age_examine_exception(bucket, rt6_ex,
1811                                                           gc_args, now);
1812                         }
1813                         bucket++;
1814                 }
1815         }
1816         spin_unlock(&rt6_exception_lock);
1817         rcu_read_unlock_bh();
1818 }
1819
1820 /* must be called with rcu lock held */
1821 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1822                                     int oif, struct flowi6 *fl6, int strict)
1823 {
1824         struct fib6_node *fn, *saved_fn;
1825         struct fib6_info *f6i;
1826
1827         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1828         saved_fn = fn;
1829
1830         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1831                 oif = 0;
1832
1833 redo_rt6_select:
1834         f6i = rt6_select(net, fn, oif, strict);
1835         if (f6i == net->ipv6.fib6_null_entry) {
1836                 fn = fib6_backtrack(fn, &fl6->saddr);
1837                 if (fn)
1838                         goto redo_rt6_select;
1839                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1840                         /* also consider unreachable route */
1841                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1842                         fn = saved_fn;
1843                         goto redo_rt6_select;
1844                 }
1845         }
1846
1847         trace_fib6_table_lookup(net, f6i, table, fl6);
1848
1849         return f6i;
1850 }
1851
1852 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1853                                int oif, struct flowi6 *fl6,
1854                                const struct sk_buff *skb, int flags)
1855 {
1856         struct fib6_info *f6i;
1857         struct rt6_info *rt;
1858         int strict = 0;
1859
1860         strict |= flags & RT6_LOOKUP_F_IFACE;
1861         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1862         if (net->ipv6.devconf_all->forwarding == 0)
1863                 strict |= RT6_LOOKUP_F_REACHABLE;
1864
1865         rcu_read_lock();
1866
1867         f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1868         if (f6i->fib6_nsiblings)
1869                 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1870
1871         if (f6i == net->ipv6.fib6_null_entry) {
1872                 rt = net->ipv6.ip6_null_entry;
1873                 rcu_read_unlock();
1874                 dst_hold(&rt->dst);
1875                 return rt;
1876         }
1877
1878         /*Search through exception table */
1879         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1880         if (rt) {
1881                 if (ip6_hold_safe(net, &rt, true))
1882                         dst_use_noref(&rt->dst, jiffies);
1883
1884                 rcu_read_unlock();
1885                 return rt;
1886         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1887                             !(f6i->fib6_flags & RTF_GATEWAY))) {
1888                 /* Create a RTF_CACHE clone which will not be
1889                  * owned by the fib6 tree.  It is for the special case where
1890                  * the daddr in the skb during the neighbor look-up is different
1891                  * from the fl6->daddr used to look-up route here.
1892                  */
1893                 struct rt6_info *uncached_rt;
1894
1895                 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1896
1897                 rcu_read_unlock();
1898
1899                 if (uncached_rt) {
1900                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1901                          * No need for another dst_hold()
1902                          */
1903                         rt6_uncached_list_add(uncached_rt);
1904                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1905                 } else {
1906                         uncached_rt = net->ipv6.ip6_null_entry;
1907                         dst_hold(&uncached_rt->dst);
1908                 }
1909
1910                 return uncached_rt;
1911         } else {
1912                 /* Get a percpu copy */
1913
1914                 struct rt6_info *pcpu_rt;
1915
1916                 local_bh_disable();
1917                 pcpu_rt = rt6_get_pcpu_route(f6i);
1918
1919                 if (!pcpu_rt)
1920                         pcpu_rt = rt6_make_pcpu_route(net, f6i);
1921
1922                 local_bh_enable();
1923                 rcu_read_unlock();
1924
1925                 return pcpu_rt;
1926         }
1927 }
1928 EXPORT_SYMBOL_GPL(ip6_pol_route);
1929
1930 static struct rt6_info *ip6_pol_route_input(struct net *net,
1931                                             struct fib6_table *table,
1932                                             struct flowi6 *fl6,
1933                                             const struct sk_buff *skb,
1934                                             int flags)
1935 {
1936         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1937 }
1938
1939 struct dst_entry *ip6_route_input_lookup(struct net *net,
1940                                          struct net_device *dev,
1941                                          struct flowi6 *fl6,
1942                                          const struct sk_buff *skb,
1943                                          int flags)
1944 {
1945         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1946                 flags |= RT6_LOOKUP_F_IFACE;
1947
1948         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1949 }
1950 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1951
1952 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1953                                   struct flow_keys *keys,
1954                                   struct flow_keys *flkeys)
1955 {
1956         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1957         const struct ipv6hdr *key_iph = outer_iph;
1958         struct flow_keys *_flkeys = flkeys;
1959         const struct ipv6hdr *inner_iph;
1960         const struct icmp6hdr *icmph;
1961         struct ipv6hdr _inner_iph;
1962         struct icmp6hdr _icmph;
1963
1964         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1965                 goto out;
1966
1967         icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1968                                    sizeof(_icmph), &_icmph);
1969         if (!icmph)
1970                 goto out;
1971
1972         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1973             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1974             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1975             icmph->icmp6_type != ICMPV6_PARAMPROB)
1976                 goto out;
1977
1978         inner_iph = skb_header_pointer(skb,
1979                                        skb_transport_offset(skb) + sizeof(*icmph),
1980                                        sizeof(_inner_iph), &_inner_iph);
1981         if (!inner_iph)
1982                 goto out;
1983
1984         key_iph = inner_iph;
1985         _flkeys = NULL;
1986 out:
1987         if (_flkeys) {
1988                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1989                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1990                 keys->tags.flow_label = _flkeys->tags.flow_label;
1991                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1992         } else {
1993                 keys->addrs.v6addrs.src = key_iph->saddr;
1994                 keys->addrs.v6addrs.dst = key_iph->daddr;
1995                 keys->tags.flow_label = ip6_flowlabel(key_iph);
1996                 keys->basic.ip_proto = key_iph->nexthdr;
1997         }
1998 }
1999
2000 /* if skb is set it will be used and fl6 can be NULL */
2001 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2002                        const struct sk_buff *skb, struct flow_keys *flkeys)
2003 {
2004         struct flow_keys hash_keys;
2005         u32 mhash;
2006
2007         switch (ip6_multipath_hash_policy(net)) {
2008         case 0:
2009                 memset(&hash_keys, 0, sizeof(hash_keys));
2010                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2011                 if (skb) {
2012                         ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2013                 } else {
2014                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2015                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2016                         hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2017                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2018                 }
2019                 break;
2020         case 1:
2021                 if (skb) {
2022                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2023                         struct flow_keys keys;
2024
2025                         /* short-circuit if we already have L4 hash present */
2026                         if (skb->l4_hash)
2027                                 return skb_get_hash_raw(skb) >> 1;
2028
2029                         memset(&hash_keys, 0, sizeof(hash_keys));
2030
2031                         if (!flkeys) {
2032                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
2033                                 flkeys = &keys;
2034                         }
2035                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2036                         hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2037                         hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2038                         hash_keys.ports.src = flkeys->ports.src;
2039                         hash_keys.ports.dst = flkeys->ports.dst;
2040                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2041                 } else {
2042                         memset(&hash_keys, 0, sizeof(hash_keys));
2043                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2044                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2045                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2046                         hash_keys.ports.src = fl6->fl6_sport;
2047                         hash_keys.ports.dst = fl6->fl6_dport;
2048                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2049                 }
2050                 break;
2051         }
2052         mhash = flow_hash_from_keys(&hash_keys);
2053
2054         return mhash >> 1;
2055 }
2056
2057 void ip6_route_input(struct sk_buff *skb)
2058 {
2059         const struct ipv6hdr *iph = ipv6_hdr(skb);
2060         struct net *net = dev_net(skb->dev);
2061         int flags = RT6_LOOKUP_F_HAS_SADDR;
2062         struct ip_tunnel_info *tun_info;
2063         struct flowi6 fl6 = {
2064                 .flowi6_iif = skb->dev->ifindex,
2065                 .daddr = iph->daddr,
2066                 .saddr = iph->saddr,
2067                 .flowlabel = ip6_flowinfo(iph),
2068                 .flowi6_mark = skb->mark,
2069                 .flowi6_proto = iph->nexthdr,
2070         };
2071         struct flow_keys *flkeys = NULL, _flkeys;
2072
2073         tun_info = skb_tunnel_info(skb);
2074         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2075                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2076
2077         if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2078                 flkeys = &_flkeys;
2079
2080         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2081                 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2082         skb_dst_drop(skb);
2083         skb_dst_set(skb,
2084                     ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2085 }
2086
2087 static struct rt6_info *ip6_pol_route_output(struct net *net,
2088                                              struct fib6_table *table,
2089                                              struct flowi6 *fl6,
2090                                              const struct sk_buff *skb,
2091                                              int flags)
2092 {
2093         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2094 }
2095
2096 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2097                                          struct flowi6 *fl6, int flags)
2098 {
2099         bool any_src;
2100
2101         if (rt6_need_strict(&fl6->daddr)) {
2102                 struct dst_entry *dst;
2103
2104                 dst = l3mdev_link_scope_lookup(net, fl6);
2105                 if (dst)
2106                         return dst;
2107         }
2108
2109         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2110
2111         any_src = ipv6_addr_any(&fl6->saddr);
2112         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2113             (fl6->flowi6_oif && any_src))
2114                 flags |= RT6_LOOKUP_F_IFACE;
2115
2116         if (!any_src)
2117                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2118         else if (sk)
2119                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2120
2121         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2122 }
2123 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2124
2125 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2126 {
2127         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2128         struct net_device *loopback_dev = net->loopback_dev;
2129         struct dst_entry *new = NULL;
2130
2131         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2132                        DST_OBSOLETE_DEAD, 0);
2133         if (rt) {
2134                 rt6_info_init(rt);
2135                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2136
2137                 new = &rt->dst;
2138                 new->__use = 1;
2139                 new->input = dst_discard;
2140                 new->output = dst_discard_out;
2141
2142                 dst_copy_metrics(new, &ort->dst);
2143
2144                 rt->rt6i_idev = in6_dev_get(loopback_dev);
2145                 rt->rt6i_gateway = ort->rt6i_gateway;
2146                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2147
2148                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2149 #ifdef CONFIG_IPV6_SUBTREES
2150                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2151 #endif
2152         }
2153
2154         dst_release(dst_orig);
2155         return new ? new : ERR_PTR(-ENOMEM);
2156 }
2157
2158 /*
2159  *      Destination cache support functions
2160  */
2161
2162 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2163 {
2164         u32 rt_cookie = 0;
2165
2166         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2167                 return false;
2168
2169         if (fib6_check_expired(f6i))
2170                 return false;
2171
2172         return true;
2173 }
2174
2175 static struct dst_entry *rt6_check(struct rt6_info *rt,
2176                                    struct fib6_info *from,
2177                                    u32 cookie)
2178 {
2179         u32 rt_cookie = 0;
2180
2181         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2182             rt_cookie != cookie)
2183                 return NULL;
2184
2185         if (rt6_check_expired(rt))
2186                 return NULL;
2187
2188         return &rt->dst;
2189 }
2190
2191 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2192                                             struct fib6_info *from,
2193                                             u32 cookie)
2194 {
2195         if (!__rt6_check_expired(rt) &&
2196             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2197             fib6_check(from, cookie))
2198                 return &rt->dst;
2199         else
2200                 return NULL;
2201 }
2202
2203 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2204 {
2205         struct dst_entry *dst_ret;
2206         struct fib6_info *from;
2207         struct rt6_info *rt;
2208
2209         rt = container_of(dst, struct rt6_info, dst);
2210
2211         rcu_read_lock();
2212
2213         /* All IPV6 dsts are created with ->obsolete set to the value
2214          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2215          * into this function always.
2216          */
2217
2218         from = rcu_dereference(rt->from);
2219
2220         if (from && (rt->rt6i_flags & RTF_PCPU ||
2221             unlikely(!list_empty(&rt->rt6i_uncached))))
2222                 dst_ret = rt6_dst_from_check(rt, from, cookie);
2223         else
2224                 dst_ret = rt6_check(rt, from, cookie);
2225
2226         rcu_read_unlock();
2227
2228         return dst_ret;
2229 }
2230
2231 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2232 {
2233         struct rt6_info *rt = (struct rt6_info *) dst;
2234
2235         if (rt) {
2236                 if (rt->rt6i_flags & RTF_CACHE) {
2237                         rcu_read_lock();
2238                         if (rt6_check_expired(rt)) {
2239                                 rt6_remove_exception_rt(rt);
2240                                 dst = NULL;
2241                         }
2242                         rcu_read_unlock();
2243                 } else {
2244                         dst_release(dst);
2245                         dst = NULL;
2246                 }
2247         }
2248         return dst;
2249 }
2250
2251 static void ip6_link_failure(struct sk_buff *skb)
2252 {
2253         struct rt6_info *rt;
2254
2255         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2256
2257         rt = (struct rt6_info *) skb_dst(skb);
2258         if (rt) {
2259                 rcu_read_lock();
2260                 if (rt->rt6i_flags & RTF_CACHE) {
2261                         if (dst_hold_safe(&rt->dst))
2262                                 rt6_remove_exception_rt(rt);
2263                 } else {
2264                         struct fib6_info *from;
2265                         struct fib6_node *fn;
2266
2267                         from = rcu_dereference(rt->from);
2268                         if (from) {
2269                                 fn = rcu_dereference(from->fib6_node);
2270                                 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2271                                         fn->fn_sernum = -1;
2272                         }
2273                 }
2274                 rcu_read_unlock();
2275         }
2276 }
2277
2278 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2279 {
2280         if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2281                 struct fib6_info *from;
2282
2283                 rcu_read_lock();
2284                 from = rcu_dereference(rt0->from);
2285                 if (from)
2286                         rt0->dst.expires = from->expires;
2287                 rcu_read_unlock();
2288         }
2289
2290         dst_set_expires(&rt0->dst, timeout);
2291         rt0->rt6i_flags |= RTF_EXPIRES;
2292 }
2293
2294 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2295 {
2296         struct net *net = dev_net(rt->dst.dev);
2297
2298         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2299         rt->rt6i_flags |= RTF_MODIFIED;
2300         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2301 }
2302
2303 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2304 {
2305         bool from_set;
2306
2307         rcu_read_lock();
2308         from_set = !!rcu_dereference(rt->from);
2309         rcu_read_unlock();
2310
2311         return !(rt->rt6i_flags & RTF_CACHE) &&
2312                 (rt->rt6i_flags & RTF_PCPU || from_set);
2313 }
2314
2315 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2316                                  const struct ipv6hdr *iph, u32 mtu)
2317 {
2318         const struct in6_addr *daddr, *saddr;
2319         struct rt6_info *rt6 = (struct rt6_info *)dst;
2320
2321         if (dst_metric_locked(dst, RTAX_MTU))
2322                 return;
2323
2324         if (iph) {
2325                 daddr = &iph->daddr;
2326                 saddr = &iph->saddr;
2327         } else if (sk) {
2328                 daddr = &sk->sk_v6_daddr;
2329                 saddr = &inet6_sk(sk)->saddr;
2330         } else {
2331                 daddr = NULL;
2332                 saddr = NULL;
2333         }
2334         dst_confirm_neigh(dst, daddr);
2335         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2336         if (mtu >= dst_mtu(dst))
2337                 return;
2338
2339         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2340                 rt6_do_update_pmtu(rt6, mtu);
2341                 /* update rt6_ex->stamp for cache */
2342                 if (rt6->rt6i_flags & RTF_CACHE)
2343                         rt6_update_exception_stamp_rt(rt6);
2344         } else if (daddr) {
2345                 struct fib6_info *from;
2346                 struct rt6_info *nrt6;
2347
2348                 rcu_read_lock();
2349                 from = rcu_dereference(rt6->from);
2350                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2351                 if (nrt6) {
2352                         rt6_do_update_pmtu(nrt6, mtu);
2353                         if (rt6_insert_exception(nrt6, from))
2354                                 dst_release_immediate(&nrt6->dst);
2355                 }
2356                 rcu_read_unlock();
2357         }
2358 }
2359
2360 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2361                                struct sk_buff *skb, u32 mtu)
2362 {
2363         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2364 }
2365
2366 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2367                      int oif, u32 mark, kuid_t uid)
2368 {
2369         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2370         struct dst_entry *dst;
2371         struct flowi6 fl6;
2372
2373         memset(&fl6, 0, sizeof(fl6));
2374         fl6.flowi6_oif = oif;
2375         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2376         fl6.daddr = iph->daddr;
2377         fl6.saddr = iph->saddr;
2378         fl6.flowlabel = ip6_flowinfo(iph);
2379         fl6.flowi6_uid = uid;
2380
2381         dst = ip6_route_output(net, NULL, &fl6);
2382         if (!dst->error)
2383                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2384         dst_release(dst);
2385 }
2386 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2387
2388 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2389 {
2390         struct dst_entry *dst;
2391
2392         ip6_update_pmtu(skb, sock_net(sk), mtu,
2393                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2394
2395         dst = __sk_dst_get(sk);
2396         if (!dst || !dst->obsolete ||
2397             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2398                 return;
2399
2400         bh_lock_sock(sk);
2401         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2402                 ip6_datagram_dst_update(sk, false);
2403         bh_unlock_sock(sk);
2404 }
2405 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2406
2407 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2408                            const struct flowi6 *fl6)
2409 {
2410 #ifdef CONFIG_IPV6_SUBTREES
2411         struct ipv6_pinfo *np = inet6_sk(sk);
2412 #endif
2413
2414         ip6_dst_store(sk, dst,
2415                       ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2416                       &sk->sk_v6_daddr : NULL,
2417 #ifdef CONFIG_IPV6_SUBTREES
2418                       ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2419                       &np->saddr :
2420 #endif
2421                       NULL);
2422 }
2423
/* Handle redirects */

/* Flow key for redirect lookups: a flowi6 extended with the gateway
 * address to match.  @fl6 must stay the first member because
 * __ip6_route_redirect() receives a pointer to it and casts that
 * pointer back to struct ip6rd_flowi.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2429
2430 static struct rt6_info *__ip6_route_redirect(struct net *net,
2431                                              struct fib6_table *table,
2432                                              struct flowi6 *fl6,
2433                                              const struct sk_buff *skb,
2434                                              int flags)
2435 {
2436         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2437         struct rt6_info *ret = NULL, *rt_cache;
2438         struct fib6_info *rt;
2439         struct fib6_node *fn;
2440
2441         /* Get the "current" route for this destination and
2442          * check if the redirect has come from appropriate router.
2443          *
2444          * RFC 4861 specifies that redirects should only be
2445          * accepted if they come from the nexthop to the target.
2446          * Due to the way the routes are chosen, this notion
2447          * is a bit fuzzy and one might need to check all possible
2448          * routes.
2449          */
2450
2451         rcu_read_lock();
2452         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2453 restart:
2454         for_each_fib6_node_rt_rcu(fn) {
2455                 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2456                         continue;
2457                 if (fib6_check_expired(rt))
2458                         continue;
2459                 if (rt->fib6_flags & RTF_REJECT)
2460                         break;
2461                 if (!(rt->fib6_flags & RTF_GATEWAY))
2462                         continue;
2463                 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2464                         continue;
2465                 /* rt_cache's gateway might be different from its 'parent'
2466                  * in the case of an ip redirect.
2467                  * So we keep searching in the exception table if the gateway
2468                  * is different.
2469                  */
2470                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2471                         rt_cache = rt6_find_cached_rt(rt,
2472                                                       &fl6->daddr,
2473                                                       &fl6->saddr);
2474                         if (rt_cache &&
2475                             ipv6_addr_equal(&rdfl->gateway,
2476                                             &rt_cache->rt6i_gateway)) {
2477                                 ret = rt_cache;
2478                                 break;
2479                         }
2480                         continue;
2481                 }
2482                 break;
2483         }
2484
2485         if (!rt)
2486                 rt = net->ipv6.fib6_null_entry;
2487         else if (rt->fib6_flags & RTF_REJECT) {
2488                 ret = net->ipv6.ip6_null_entry;
2489                 goto out;
2490         }
2491
2492         if (rt == net->ipv6.fib6_null_entry) {
2493                 fn = fib6_backtrack(fn, &fl6->saddr);
2494                 if (fn)
2495                         goto restart;
2496         }
2497
2498 out:
2499         if (ret)
2500                 ip6_hold_safe(net, &ret, true);
2501         else
2502                 ret = ip6_create_rt_rcu(rt);
2503
2504         rcu_read_unlock();
2505
2506         trace_fib6_table_lookup(net, rt, table, fl6);
2507         return ret;
2508 };
2509
2510 static struct dst_entry *ip6_route_redirect(struct net *net,
2511                                             const struct flowi6 *fl6,
2512                                             const struct sk_buff *skb,
2513                                             const struct in6_addr *gateway)
2514 {
2515         int flags = RT6_LOOKUP_F_HAS_SADDR;
2516         struct ip6rd_flowi rdfl;
2517
2518         rdfl.fl6 = *fl6;
2519         rdfl.gateway = *gateway;
2520
2521         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2522                                 flags, __ip6_route_redirect);
2523 }
2524
2525 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2526                   kuid_t uid)
2527 {
2528         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2529         struct dst_entry *dst;
2530         struct flowi6 fl6;
2531
2532         memset(&fl6, 0, sizeof(fl6));
2533         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2534         fl6.flowi6_oif = oif;
2535         fl6.flowi6_mark = mark;
2536         fl6.daddr = iph->daddr;
2537         fl6.saddr = iph->saddr;
2538         fl6.flowlabel = ip6_flowinfo(iph);
2539         fl6.flowi6_uid = uid;
2540
2541         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2542         rt6_do_redirect(dst, NULL, skb);
2543         dst_release(dst);
2544 }
2545 EXPORT_SYMBOL_GPL(ip6_redirect);
2546
2547 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2548                             u32 mark)
2549 {
2550         const struct ipv6hdr *iph = ipv6_hdr(skb);
2551         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2552         struct dst_entry *dst;
2553         struct flowi6 fl6;
2554
2555         memset(&fl6, 0, sizeof(fl6));
2556         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2557         fl6.flowi6_oif = oif;
2558         fl6.flowi6_mark = mark;
2559         fl6.daddr = msg->dest;
2560         fl6.saddr = iph->daddr;
2561         fl6.flowi6_uid = sock_net_uid(net, NULL);
2562
2563         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2564         rt6_do_redirect(dst, NULL, skb);
2565         dst_release(dst);
2566 }
2567
2568 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2569 {
2570         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2571                      sk->sk_uid);
2572 }
2573 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2574
2575 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2576 {
2577         struct net_device *dev = dst->dev;
2578         unsigned int mtu = dst_mtu(dst);
2579         struct net *net = dev_net(dev);
2580
2581         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2582
2583         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2584                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2585
2586         /*
2587          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2588          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2589          * IPV6_MAXPLEN is also valid and means: "any MSS,
2590          * rely only on pmtu discovery"
2591          */
2592         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2593                 mtu = IPV6_MAXPLEN;
2594         return mtu;
2595 }
2596
2597 static unsigned int ip6_mtu(const struct dst_entry *dst)
2598 {
2599         struct inet6_dev *idev;
2600         unsigned int mtu;
2601
2602         mtu = dst_metric_raw(dst, RTAX_MTU);
2603         if (mtu)
2604                 goto out;
2605
2606         mtu = IPV6_MIN_MTU;
2607
2608         rcu_read_lock();
2609         idev = __in6_dev_get(dst->dev);
2610         if (idev)
2611                 mtu = idev->cnf.mtu6;
2612         rcu_read_unlock();
2613
2614 out:
2615         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2616
2617         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2618 }
2619
2620 /* MTU selection:
2621  * 1. mtu on route is locked - use it
2622  * 2. mtu from nexthop exception
2623  * 3. mtu from egress device
2624  *
2625  * based on ip6_dst_mtu_forward and exception logic of
2626  * rt6_find_cached_rt; called with rcu_read_lock
2627  */
2628 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2629                       struct in6_addr *saddr)
2630 {
2631         struct rt6_exception_bucket *bucket;
2632         struct rt6_exception *rt6_ex;
2633         struct in6_addr *src_key;
2634         struct inet6_dev *idev;
2635         u32 mtu = 0;
2636
2637         if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2638                 mtu = f6i->fib6_pmtu;
2639                 if (mtu)
2640                         goto out;
2641         }
2642
2643         src_key = NULL;
2644 #ifdef CONFIG_IPV6_SUBTREES
2645         if (f6i->fib6_src.plen)
2646                 src_key = saddr;
2647 #endif
2648
2649         bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2650         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2651         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2652                 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2653
2654         if (likely(!mtu)) {
2655                 struct net_device *dev = fib6_info_nh_dev(f6i);
2656
2657                 mtu = IPV6_MIN_MTU;
2658                 idev = __in6_dev_get(dev);
2659                 if (idev && idev->cnf.mtu6 > mtu)
2660                         mtu = idev->cnf.mtu6;
2661         }
2662
2663         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2664 out:
2665         return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2666 }
2667
2668 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2669                                   struct flowi6 *fl6)
2670 {
2671         struct dst_entry *dst;
2672         struct rt6_info *rt;
2673         struct inet6_dev *idev = in6_dev_get(dev);
2674         struct net *net = dev_net(dev);
2675
2676         if (unlikely(!idev))
2677                 return ERR_PTR(-ENODEV);
2678
2679         rt = ip6_dst_alloc(net, dev, 0);
2680         if (unlikely(!rt)) {
2681                 in6_dev_put(idev);
2682                 dst = ERR_PTR(-ENOMEM);
2683                 goto out;
2684         }
2685
2686         rt->dst.flags |= DST_HOST;
2687         rt->dst.input = ip6_input;
2688         rt->dst.output  = ip6_output;
2689         rt->rt6i_gateway  = fl6->daddr;
2690         rt->rt6i_dst.addr = fl6->daddr;
2691         rt->rt6i_dst.plen = 128;
2692         rt->rt6i_idev     = idev;
2693         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2694
2695         /* Add this dst into uncached_list so that rt6_disable_ip() can
2696          * do proper release of the net_device
2697          */
2698         rt6_uncached_list_add(rt);
2699         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2700
2701         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2702
2703 out:
2704         return dst;
2705 }
2706
2707 static int ip6_dst_gc(struct dst_ops *ops)
2708 {
2709         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2710         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2711         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2712         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2713         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2714         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2715         int entries;
2716
2717         entries = dst_entries_get_fast(ops);
2718         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2719             entries <= rt_max_size)
2720                 goto out;
2721
2722         net->ipv6.ip6_rt_gc_expire++;
2723         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2724         entries = dst_entries_get_slow(ops);
2725         if (entries < ops->gc_thresh)
2726                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2727 out:
2728         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2729         return entries > rt_max_size;
2730 }
2731
2732 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2733                                struct fib6_config *cfg)
2734 {
2735         struct dst_metrics *p;
2736
2737         if (!cfg->fc_mx)
2738                 return 0;
2739
2740         p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2741         if (unlikely(!p))
2742                 return -ENOMEM;
2743
2744         refcount_set(&p->refcnt, 1);
2745         rt->fib6_metrics = p;
2746
2747         return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2748 }
2749
2750 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2751                                             struct fib6_config *cfg,
2752                                             const struct in6_addr *gw_addr,
2753                                             u32 tbid, int flags)
2754 {
2755         struct flowi6 fl6 = {
2756                 .flowi6_oif = cfg->fc_ifindex,
2757                 .daddr = *gw_addr,
2758                 .saddr = cfg->fc_prefsrc,
2759         };
2760         struct fib6_table *table;
2761         struct rt6_info *rt;
2762
2763         table = fib6_get_table(net, tbid);
2764         if (!table)
2765                 return NULL;
2766
2767         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2768                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2769
2770         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2771         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2772
2773         /* if table lookup failed, fall back to full lookup */
2774         if (rt == net->ipv6.ip6_null_entry) {
2775                 ip6_rt_put(rt);
2776                 rt = NULL;
2777         }
2778
2779         return rt;
2780 }
2781
2782 static int ip6_route_check_nh_onlink(struct net *net,
2783                                      struct fib6_config *cfg,
2784                                      const struct net_device *dev,
2785                                      struct netlink_ext_ack *extack)
2786 {
2787         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2788         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2789         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2790         struct rt6_info *grt;
2791         int err;
2792
2793         err = 0;
2794         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2795         if (grt) {
2796                 if (!grt->dst.error &&
2797                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2798                         NL_SET_ERR_MSG(extack,
2799                                        "Nexthop has invalid gateway or device mismatch");
2800                         err = -EINVAL;
2801                 }
2802
2803                 ip6_rt_put(grt);
2804         }
2805
2806         return err;
2807 }
2808
2809 static int ip6_route_check_nh(struct net *net,
2810                               struct fib6_config *cfg,
2811                               struct net_device **_dev,
2812                               struct inet6_dev **idev)
2813 {
2814         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2815         struct net_device *dev = _dev ? *_dev : NULL;
2816         struct rt6_info *grt = NULL;
2817         int err = -EHOSTUNREACH;
2818
2819         if (cfg->fc_table) {
2820                 int flags = RT6_LOOKUP_F_IFACE;
2821
2822                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2823                                           cfg->fc_table, flags);
2824                 if (grt) {
2825                         if (grt->rt6i_flags & RTF_GATEWAY ||
2826                             (dev && dev != grt->dst.dev)) {
2827                                 ip6_rt_put(grt);
2828                                 grt = NULL;
2829                         }
2830                 }
2831         }
2832
2833         if (!grt)
2834                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2835
2836         if (!grt)
2837                 goto out;
2838
2839         if (dev) {
2840                 if (dev != grt->dst.dev) {
2841                         ip6_rt_put(grt);
2842                         goto out;
2843                 }
2844         } else {
2845                 *_dev = dev = grt->dst.dev;
2846                 *idev = grt->rt6i_idev;
2847                 dev_hold(dev);
2848                 in6_dev_hold(grt->rt6i_idev);
2849         }
2850
2851         if (!(grt->rt6i_flags & RTF_GATEWAY))
2852                 err = 0;
2853
2854         ip6_rt_put(grt);
2855
2856 out:
2857         return err;
2858 }
2859
/* Validate cfg->fc_gateway as the nexthop address of a new route.
 *
 * @_dev/@idev: in/out.  *_dev is the egress device chosen so far (may be
 *              NULL); it is re-read after ip6_route_check_nh() because
 *              that helper may fill in *_dev and *idev.
 * @extack:     receives a message for each rejection reported here.
 *
 * Checks, in order:
 *  - the gateway must not be a local address on the device (done up front
 *    when a device is already known, otherwise once it is resolved);
 *  - a gateway that is not exactly link-local unicast must be unicast or
 *    IPv4-mapped, and must pass ip6_route_check_nh_onlink() (RTNH_F_ONLINK)
 *    or ip6_route_check_nh();
 *  - an egress device must be known and must not be a loopback device.
 *
 * Returns 0 on success, -EINVAL for the checks done here, or the error
 * returned by the nexthop check helper.
 */
2860 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2861                            struct net_device **_dev, struct inet6_dev **idev,
2862                            struct netlink_ext_ack *extack)
2863 {
2864         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2865         int gwa_type = ipv6_addr_type(gw_addr);
             /* skip_dev is false only for link-local gateways; it is passed
              * straight to ipv6_chk_addr_and_flags() (presumably to make the
              * check device-independent for non-link-local gateways -- verify
              * against that helper).
              */
2866         bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2867         const struct net_device *dev = *_dev;
             /* remember that the early local-address check below is skipped */
2868         bool need_addr_check = !dev;
2869         int err = -EINVAL;
2870
2871         /* if gw_addr is local we will fail to detect this in case
2872          * address is still TENTATIVE (DAD in progress). rt6_lookup()
2873          * will return already-added prefix route via interface that
2874          * prefix route was assigned to, which might be non-loopback.
2875          */
2876         if (dev &&
2877             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2878                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2879                 goto out;
2880         }
2881
2882         if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2883                 /* IPv6 strictly inhibits using not link-local
2884                  * addresses as nexthop address.
2885                  * Otherwise, router will not able to send redirects.
2886                  * It is very good, but in some (rare!) circumstances
2887                  * (SIT, PtP, NBMA NOARP links) it is handy to allow
2888                  * some exceptions. --ANK
2889                  * We allow IPv4-mapped nexthops to support RFC4798-type
2890                  * addressing
2891                  */
2892                 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2893                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2894                         goto out;
2895                 }
2896
2897                 if (cfg->fc_flags & RTNH_F_ONLINK)
2898                         err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2899                 else
2900                         err = ip6_route_check_nh(net, cfg, _dev, idev);
2901
2902                 if (err)
2903                         goto out;
2904         }
2905
2906         /* reload in case device was changed */
2907         dev = *_dev;
2908
2909         err = -EINVAL;
2910         if (!dev) {
2911                 NL_SET_ERR_MSG(extack, "Egress device not specified");
2912                 goto out;
2913         } else if (dev->flags & IFF_LOOPBACK) {
2914                 NL_SET_ERR_MSG(extack,
2915                                "Egress device can not be loopback device for this route");
2916                 goto out;
2917         }
2918
2919         /* if we did not check gw_addr above, do so now that the
2920          * egress device has been resolved.
2921          */
2922         if (need_addr_check &&
2923             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2924                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2925                 goto out;
2926         }
2927
2928         err = 0;
2929 out:
2930         return err;
2931 }
2932
/* Build a fib6_info from a route request, without inserting it.
 *
 * @cfg:       the request.  It is also modified: fc_metric defaults to
 *             IP6_RT_PRIO_USER, fc_protocol defaults to RTPROT_BOOT, and on
 *             success fc_nlinfo.nl_net is set to dev_net() of the final
 *             device.
 * @gfp_flags: allocation flags passed to fib6_info_alloc().
 * @extack:    receives a message for most validation failures.
 *
 * Returns the new fib6_info, or ERR_PTR(err) on failure.
 *
 * Reference handling as visible in this function: on success the device
 * reference is kept and stored in rt->fib6_nh.nh_dev while the inet6_dev
 * reference is dropped; on failure both are dropped and the partially
 * built rt is released through fib6_info_release().
 */
2933 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2934                                               gfp_t gfp_flags,
2935                                               struct netlink_ext_ack *extack)
2936 {
2937         struct net *net = cfg->fc_nlinfo.nl_net;
2938         struct fib6_info *rt = NULL;
2939         struct net_device *dev = NULL;
2940         struct inet6_dev *idev = NULL;
2941         struct fib6_table *table;
2942         int addr_type;
2943         int err = -EINVAL;
2944
2945         /* RTF_PCPU is an internal flag; can not be set by userspace */
2946         if (cfg->fc_flags & RTF_PCPU) {
2947                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2948                 goto out;
2949         }
2950
2951         /* RTF_CACHE is an internal flag; can not be set by userspace */
2952         if (cfg->fc_flags & RTF_CACHE) {
2953                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2954                 goto out;
2955         }
2956
2957         if (cfg->fc_type > RTN_MAX) {
2958                 NL_SET_ERR_MSG(extack, "Invalid route type");
2959                 goto out;
2960         }
2961
2962         if (cfg->fc_dst_len > 128) {
2963                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2964                 goto out;
2965         }
2966         if (cfg->fc_src_len > 128) {
2967                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2968                 goto out;
2969         }
2970 #ifndef CONFIG_IPV6_SUBTREES
2971         if (cfg->fc_src_len) {
2972                 NL_SET_ERR_MSG(extack,
2973                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2974                 goto out;
2975         }
2976 #endif
             /* Resolve the requested device; both lookups take a reference
              * that the out: path (or the success path, for idev) drops.
              */
2977         if (cfg->fc_ifindex) {
2978                 err = -ENODEV;
2979                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2980                 if (!dev)
2981                         goto out;
2982                 idev = in6_dev_get(dev);
2983                 if (!idev)
2984                         goto out;
2985         }
2986
2987         if (cfg->fc_metric == 0)
2988                 cfg->fc_metric = IP6_RT_PRIO_USER;
2989
2990         if (cfg->fc_flags & RTNH_F_ONLINK) {
2991                 if (!dev) {
2992                         NL_SET_ERR_MSG(extack,
2993                                        "Nexthop device required for onlink");
2994                         err = -ENODEV;
2995                         goto out;
2996                 }
2997
2998                 if (!(dev->flags & IFF_UP)) {
2999                         NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3000                         err = -ENETDOWN;
3001                         goto out;
3002                 }
3003         }
3004
             /* Without NLM_F_CREATE an existing table is preferred, but a
              * missing one is still created after logging a warning.
              */
3005         err = -ENOBUFS;
3006         if (cfg->fc_nlinfo.nlh &&
3007             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3008                 table = fib6_get_table(net, cfg->fc_table);
3009                 if (!table) {
3010                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3011                         table = fib6_new_table(net, cfg->fc_table);
3012                 }
3013         } else {
3014                 table = fib6_new_table(net, cfg->fc_table);
3015         }
3016
3017         if (!table)
3018                 goto out;
3019
3020         err = -ENOMEM;
3021         rt = fib6_info_alloc(gfp_flags);
3022         if (!rt)
3023                 goto out;
3024
3025         if (cfg->fc_flags & RTF_ADDRCONF)
3026                 rt->dst_nocount = true;
3027
3028         err = ip6_convert_metrics(net, rt, cfg);
3029         if (err < 0)
3030                 goto out;
3031
3032         if (cfg->fc_flags & RTF_EXPIRES)
3033                 fib6_set_expires(rt, jiffies +
3034                                 clock_t_to_jiffies(cfg->fc_expires));
3035         else
3036                 fib6_clean_expires(rt);
3037
3038         if (cfg->fc_protocol == RTPROT_UNSPEC)
3039                 cfg->fc_protocol = RTPROT_BOOT;
3040         rt->fib6_protocol = cfg->fc_protocol;
3041
3042         addr_type = ipv6_addr_type(&cfg->fc_dst);
3043
3044         if (cfg->fc_encap) {
3045                 struct lwtunnel_state *lwtstate;
3046
3047                 err = lwtunnel_build_state(cfg->fc_encap_type,
3048                                            cfg->fc_encap, AF_INET6, cfg,
3049                                            &lwtstate, extack);
3050                 if (err)
3051                         goto out;
3052                 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3053         }
3054
3055         ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3056         rt->fib6_dst.plen = cfg->fc_dst_len;
3057         if (rt->fib6_dst.plen == 128)
3058                 rt->dst_host = true;
3059
3060 #ifdef CONFIG_IPV6_SUBTREES
3061         ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3062         rt->fib6_src.plen = cfg->fc_src_len;
3063 #endif
3064
3065         rt->fib6_metric = cfg->fc_metric;
3066         rt->fib6_nh.nh_weight = 1;
3067
3068         rt->fib6_type = cfg->fc_type;
3069
3070         /* We cannot add true routes via loopback here,
3071            they would result in kernel looping; promote them to reject routes
3072          */
3073         if ((cfg->fc_flags & RTF_REJECT) ||
3074             (dev && (dev->flags & IFF_LOOPBACK) &&
3075              !(addr_type & IPV6_ADDR_LOOPBACK) &&
3076              !(cfg->fc_flags & RTF_LOCAL))) {
3077                 /* hold loopback dev/idev if we haven't done so. */
3078                 if (dev != net->loopback_dev) {
3079                         if (dev) {
3080                                 dev_put(dev);
3081                                 in6_dev_put(idev);
3082                         }
3083                         dev = net->loopback_dev;
3084                         dev_hold(dev);
3085                         idev = in6_dev_get(dev);
3086                         if (!idev) {
3087                                 err = -ENODEV;
3088                                 goto out;
3089                         }
3090                 }
                     /* reject routes skip gateway, device-state and prefsrc
                      * validation and go straight to installation
                      */
3091                 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3092                 goto install_route;
3093         }
3094
             /* ip6_validate_gw() may fill in dev/idev when none was given */
3095         if (cfg->fc_flags & RTF_GATEWAY) {
3096                 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3097                 if (err)
3098                         goto out;
3099
3100                 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3101         }
3102
3103         err = -ENODEV;
3104         if (!dev)
3105                 goto out;
3106
3107         if (idev->cnf.disable_ipv6) {
3108                 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3109                 err = -EACCES;
3110                 goto out;
3111         }
3112
3113         if (!(dev->flags & IFF_UP)) {
3114                 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3115                 err = -ENETDOWN;
3116                 goto out;
3117         }
3118
3119         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3120                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3121                         NL_SET_ERR_MSG(extack, "Invalid source address");
3122                         err = -EINVAL;
3123                         goto out;
3124                 }
3125                 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3126                 rt->fib6_prefsrc.plen = 128;
3127         } else
3128                 rt->fib6_prefsrc.plen = 0;
3129
3130         rt->fib6_flags = cfg->fc_flags;
3131
3132 install_route:
3133         if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3134             !netif_carrier_ok(dev))
3135                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3136         rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
             /* the device reference held in dev is handed over to the nexthop */
3137         rt->fib6_nh.nh_dev = dev;
3138         rt->fib6_table = table;
3139
3140         cfg->fc_nlinfo.nl_net = dev_net(dev);
3141
3142         if (idev)
3143                 in6_dev_put(idev);
3144
3145         return rt;
3146 out:
3147         if (dev)
3148                 dev_put(dev);
3149         if (idev)
3150                 in6_dev_put(idev);
3151
             /* rt may still be NULL when an early check failed */
3152         fib6_info_release(rt);
3153         return ERR_PTR(err);
3154 }
3155
3156 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3157                   struct netlink_ext_ack *extack)
3158 {
3159         struct fib6_info *rt;
3160         int err;
3161
3162         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3163         if (IS_ERR(rt))
3164                 return PTR_ERR(rt);
3165
3166         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3167         fib6_info_release(rt);
3168
3169         return err;
3170 }
3171
3172 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3173 {
3174         struct net *net = info->nl_net;
3175         struct fib6_table *table;
3176         int err;
3177
3178         if (rt == net->ipv6.fib6_null_entry) {
3179                 err = -ENOENT;
3180                 goto out;
3181         }
3182
3183         table = rt->fib6_table;
3184         spin_lock_bh(&table->tb6_lock);
3185         err = fib6_del(rt, info);
3186         spin_unlock_bh(&table->tb6_lock);
3187
3188 out:
3189         fib6_info_release(rt);
3190         return err;
3191 }
3192
3193 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3194 {
3195         struct nl_info info = { .nl_net = net };
3196
3197         return __ip6_del_rt(rt, &info);
3198 }
3199
/* Delete @rt and, when cfg->fc_delete_all_nh is set and @rt has siblings,
 * all of its sibling routes as well.
 *
 * For the multi-nexthop case one RTM_DELROUTE message describing @rt is
 * built up front; if that succeeds, info->skip_notify is set so the
 * individual fib6_del() calls do not notify, and the single message is
 * sent after the table lock has been dropped.
 *
 * Returns -ENOENT for the fib6_null_entry, otherwise the first fib6_del()
 * error or the result of deleting @rt itself.  The caller's reference on
 * @rt is released in every case.
 *
 * NOTE(review): if deleting a sibling fails after the combined message was
 * built, the message is still sent on the way out -- confirm this is
 * intended.
 */
3200 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3201 {
3202         struct nl_info *info = &cfg->fc_nlinfo;
3203         struct net *net = info->nl_net;
3204         struct sk_buff *skb = NULL;
3205         struct fib6_table *table;
3206         int err = -ENOENT;
3207
3208         if (rt == net->ipv6.fib6_null_entry)
3209                 goto out_put;
3210         table = rt->fib6_table;
3211         spin_lock_bh(&table->tb6_lock);
3212
3213         if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3214                 struct fib6_info *sibling, *next_sibling;
3215
3216                 /* prefer to send a single notification with all hops */
3217                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3218                 if (skb) {
3219                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3220
3221                         if (rt6_fill_node(net, skb, rt, NULL,
3222                                           NULL, NULL, 0, RTM_DELROUTE,
3223                                           info->portid, seq, 0) < 0) {
                                     /* could not build it: fall back to the
                                      * notifications done by fib6_del()
                                      */
3224                                 kfree_skb(skb);
3225                                 skb = NULL;
3226                         } else
3227                                 info->skip_notify = 1;
3228                 }
3229
3230                 list_for_each_entry_safe(sibling, next_sibling,
3231                                          &rt->fib6_siblings,
3232                                          fib6_siblings) {
3233                         err = fib6_del(sibling, info);
3234                         if (err)
3235                                 goto out_unlock;
3236                 }
3237         }
3238
3239         err = fib6_del(rt, info);
3240 out_unlock:
3241         spin_unlock_bh(&table->tb6_lock);
3242 out_put:
3243         fib6_info_release(rt);
3244
3245         if (skb) {
3246                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3247                             info->nlh, gfp_any());
3248         }
3249         return err;
3250 }
3251
3252 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3253 {
3254         int rc = -ESRCH;
3255
3256         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3257                 goto out;
3258
3259         if (cfg->fc_flags & RTF_GATEWAY &&
3260             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3261                 goto out;
3262         if (dst_hold_safe(&rt->dst))
3263                 rc = rt6_remove_exception_rt(rt);
3264 out:
3265         return rc;
3266 }
3267
/* Delete the route described by @cfg.
 *
 * The node for fc_dst/fc_src is located in table cfg->fc_table and its
 * routes are walked under rcu_read_lock():
 *  - with RTF_CACHE set, only cached routes are considered: the first one
 *    for which ip6_del_cached_rt() returns something other than -ESRCH
 *    ends the search with that result;
 *  - otherwise the first route that matches the optional ifindex, gateway,
 *    metric and protocol filters, and on which a reference can be taken,
 *    is deleted.  With RTF_GATEWAY only that nexthop is removed, else
 *    __ip6_del_rt_siblings() handles the route and possibly its siblings.
 *
 * Returns -ESRCH when the table does not exist or nothing matched,
 * otherwise the result of the delete helper that was called.
 */
3268 static int ip6_route_del(struct fib6_config *cfg,
3269                          struct netlink_ext_ack *extack)
3270 {
3271         struct rt6_info *rt_cache;
3272         struct fib6_table *table;
3273         struct fib6_info *rt;
3274         struct fib6_node *fn;
3275         int err = -ESRCH;
3276
3277         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3278         if (!table) {
3279                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3280                 return err;
3281         }
3282
3283         rcu_read_lock();
3284
3285         fn = fib6_locate(&table->tb6_root,
3286                          &cfg->fc_dst, cfg->fc_dst_len,
3287                          &cfg->fc_src, cfg->fc_src_len,
3288                          !(cfg->fc_flags & RTF_CACHE));
3289
3290         if (fn) {
3291                 for_each_fib6_node_rt_rcu(fn) {
3292                         if (cfg->fc_flags & RTF_CACHE) {
3293                                 int rc;
3294
3295                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3296                                                               &cfg->fc_src);
3297                                 if (rt_cache) {
3298                                         rc = ip6_del_cached_rt(rt_cache, cfg);
                                             /* -ESRCH means "filtered out":
                                              * keep looking at the next route
                                              */
3299                                         if (rc != -ESRCH) {
3300                                                 rcu_read_unlock();
3301                                                 return rc;
3302                                         }
3303                                 }
3304                                 continue;
3305                         }
3306                         if (cfg->fc_ifindex &&
3307                             (!rt->fib6_nh.nh_dev ||
3308                              rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3309                                 continue;
3310                         if (cfg->fc_flags & RTF_GATEWAY &&
3311                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3312                                 continue;
3313                         if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3314                                 continue;
3315                         if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3316                                 continue;
                             /* take a reference before leaving the RCU section;
                              * the delete helpers below release it
                              */
3317                         if (!fib6_info_hold_safe(rt))
3318                                 continue;
3319                         rcu_read_unlock();
3320
3321                         /* if gateway was specified only delete the one hop */
3322                         if (cfg->fc_flags & RTF_GATEWAY)
3323                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3324
3325                         return __ip6_del_rt_siblings(rt, cfg);
3326                 }
3327         }
3328         rcu_read_unlock();
3329
3330         return err;
3331 }
3332
3333 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3334 {
3335         struct netevent_redirect netevent;
3336         struct rt6_info *rt, *nrt = NULL;
3337         struct ndisc_options ndopts;
3338         struct inet6_dev *in6_dev;
3339         struct neighbour *neigh;
3340         struct fib6_info *from;
3341         struct rd_msg *msg;
3342         int optlen, on_link;
3343         u8 *lladdr;
3344
3345         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3346         optlen -= sizeof(*msg);
3347
3348         if (optlen < 0) {
3349                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3350                 return;
3351         }
3352
3353         msg = (struct rd_msg *)icmp6_hdr(skb);
3354
3355         if (ipv6_addr_is_multicast(&msg->dest)) {
3356                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3357                 return;
3358         }
3359
3360         on_link = 0;
3361         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3362                 on_link = 1;
3363         } else if (ipv6_addr_type(&msg->target) !=
3364                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3365                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3366                 return;
3367         }
3368
3369         in6_dev = __in6_dev_get(skb->dev);
3370         if (!in6_dev)
3371                 return;
3372         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3373                 return;
3374
3375         /* RFC2461 8.1:
3376          *      The IP source address of the Redirect MUST be the same as the current
3377          *      first-hop router for the specified ICMP Destination Address.
3378          */
3379
3380         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3381                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3382                 return;
3383         }
3384
3385         lladdr = NULL;
3386         if (ndopts.nd_opts_tgt_lladdr) {
3387                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3388                                              skb->dev);
3389                 if (!lladdr) {
3390                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3391                         return;
3392                 }
3393         }
3394
3395         rt = (struct rt6_info *) dst;
3396         if (rt->rt6i_flags & RTF_REJECT) {
3397                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3398                 return;
3399         }
3400
3401         /* Redirect received -> path was valid.
3402          * Look, redirects are sent only in response to data packets,
3403          * so that this nexthop apparently is reachable. --ANK
3404          */
3405         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3406
3407         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3408         if (!neigh)
3409                 return;
3410
3411         /*
3412          *      We have finally decided to accept it.
3413          */
3414
3415         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3416                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3417                      NEIGH_UPDATE_F_OVERRIDE|
3418                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3419                                      NEIGH_UPDATE_F_ISROUTER)),
3420                      NDISC_REDIRECT, &ndopts);
3421
3422         rcu_read_lock();
3423         from = rcu_dereference(rt->from);
3424         /* This fib6_info_hold() is safe here because we hold reference to rt
3425          * and rt already holds reference to fib6_info.
3426          */
3427         fib6_info_hold(from);
3428         rcu_read_unlock();
3429
3430         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3431         if (!nrt)
3432                 goto out;
3433
3434         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3435         if (on_link)
3436                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3437
3438         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3439
3440         /* No need to remove rt from the exception table if rt is
3441          * a cached route because rt6_insert_exception() will
3442          * takes care of it
3443          */
3444         if (rt6_insert_exception(nrt, from)) {
3445                 dst_release_immediate(&nrt->dst);
3446                 goto out;
3447         }
3448
3449         netevent.old = &rt->dst;
3450         netevent.new = &nrt->dst;
3451         netevent.daddr = &msg->dest;
3452         netevent.neigh = neigh;
3453         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3454
3455 out:
3456         fib6_info_release(from);
3457         neigh_release(neigh);
3458 }
3459
3460 #ifdef CONFIG_IPV6_ROUTE_INFO
3461 static struct fib6_info *rt6_get_route_info(struct net *net,
3462                                            const struct in6_addr *prefix, int prefixlen,
3463                                            const struct in6_addr *gwaddr,
3464                                            struct net_device *dev)
3465 {
3466         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3467         int ifindex = dev->ifindex;
3468         struct fib6_node *fn;
3469         struct fib6_info *rt = NULL;
3470         struct fib6_table *table;
3471
3472         table = fib6_get_table(net, tb_id);
3473         if (!table)
3474                 return NULL;
3475
3476         rcu_read_lock();
3477         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3478         if (!fn)
3479                 goto out;
3480
3481         for_each_fib6_node_rt_rcu(fn) {
3482                 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3483                         continue;
3484                 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3485                         continue;
3486                 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3487                         continue;
3488                 if (!fib6_info_hold_safe(rt))
3489                         continue;
3490                 break;
3491         }
3492 out:
3493         rcu_read_unlock();
3494         return rt;
3495 }
3496
3497 static struct fib6_info *rt6_add_route_info(struct net *net,
3498                                            const struct in6_addr *prefix, int prefixlen,
3499                                            const struct in6_addr *gwaddr,
3500                                            struct net_device *dev,
3501                                            unsigned int pref)
3502 {
3503         struct fib6_config cfg = {
3504                 .fc_metric      = IP6_RT_PRIO_USER,
3505                 .fc_ifindex     = dev->ifindex,
3506                 .fc_dst_len     = prefixlen,
3507                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3508                                   RTF_UP | RTF_PREF(pref),
3509                 .fc_protocol = RTPROT_RA,
3510                 .fc_type = RTN_UNICAST,
3511                 .fc_nlinfo.portid = 0,
3512                 .fc_nlinfo.nlh = NULL,
3513                 .fc_nlinfo.nl_net = net,
3514         };
3515
3516         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3517         cfg.fc_dst = *prefix;
3518         cfg.fc_gateway = *gwaddr;
3519
3520         /* We should treat it as a default route if prefix length is 0. */
3521         if (!prefixlen)
3522                 cfg.fc_flags |= RTF_DEFAULT;
3523
3524         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3525
3526         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3527 }
3528 #endif
3529
3530 struct fib6_info *rt6_get_dflt_router(struct net *net,
3531                                      const struct in6_addr *addr,
3532                                      struct net_device *dev)
3533 {
3534         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3535         struct fib6_info *rt;
3536         struct fib6_table *table;
3537
3538         table = fib6_get_table(net, tb_id);
3539         if (!table)
3540                 return NULL;
3541
3542         rcu_read_lock();
3543         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3544                 if (dev == rt->fib6_nh.nh_dev &&
3545                     ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3546                     ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3547                         break;
3548         }
3549         if (rt && !fib6_info_hold_safe(rt))
3550                 rt = NULL;
3551         rcu_read_unlock();
3552         return rt;
3553 }
3554
3555 struct fib6_info *rt6_add_dflt_router(struct net *net,
3556                                      const struct in6_addr *gwaddr,
3557                                      struct net_device *dev,
3558                                      unsigned int pref)
3559 {
3560         struct fib6_config cfg = {
3561                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3562                 .fc_metric      = IP6_RT_PRIO_USER,
3563                 .fc_ifindex     = dev->ifindex,
3564                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3565                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3566                 .fc_protocol = RTPROT_RA,
3567                 .fc_type = RTN_UNICAST,
3568                 .fc_nlinfo.portid = 0,
3569                 .fc_nlinfo.nlh = NULL,
3570                 .fc_nlinfo.nl_net = net,
3571         };
3572
3573         cfg.fc_gateway = *gwaddr;
3574
3575         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3576                 struct fib6_table *table;
3577
3578                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3579                 if (table)
3580                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3581         }
3582
3583         return rt6_get_dflt_router(net, gwaddr, dev);
3584 }
3585
3586 static void __rt6_purge_dflt_routers(struct net *net,
3587                                      struct fib6_table *table)
3588 {
3589         struct fib6_info *rt;
3590
3591 restart:
3592         rcu_read_lock();
3593         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3594                 struct net_device *dev = fib6_info_nh_dev(rt);
3595                 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3596
3597                 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3598                     (!idev || idev->cnf.accept_ra != 2) &&
3599                     fib6_info_hold_safe(rt)) {
3600                         rcu_read_unlock();
3601                         ip6_del_rt(net, rt);
3602                         goto restart;
3603                 }
3604         }
3605         rcu_read_unlock();
3606
3607         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3608 }
3609
3610 void rt6_purge_dflt_routers(struct net *net)
3611 {
3612         struct fib6_table *table;
3613         struct hlist_head *head;
3614         unsigned int h;
3615
3616         rcu_read_lock();
3617
3618         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3619                 head = &net->ipv6.fib_table_hash[h];
3620                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3621                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3622                                 __rt6_purge_dflt_routers(net, table);
3623                 }
3624         }
3625
3626         rcu_read_unlock();
3627 }
3628
3629 static void rtmsg_to_fib6_config(struct net *net,
3630                                  struct in6_rtmsg *rtmsg,
3631                                  struct fib6_config *cfg)
3632 {
3633         memset(cfg, 0, sizeof(*cfg));
3634
3635         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3636                          : RT6_TABLE_MAIN;
3637         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3638         cfg->fc_metric = rtmsg->rtmsg_metric;
3639         cfg->fc_expires = rtmsg->rtmsg_info;
3640         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3641         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3642         cfg->fc_flags = rtmsg->rtmsg_flags;
3643         cfg->fc_type = rtmsg->rtmsg_type;
3644
3645         cfg->fc_nlinfo.nl_net = net;
3646
3647         cfg->fc_dst = rtmsg->rtmsg_dst;
3648         cfg->fc_src = rtmsg->rtmsg_src;
3649         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3650 }
3651
3652 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3653 {
3654         struct fib6_config cfg;
3655         struct in6_rtmsg rtmsg;
3656         int err;
3657
3658         switch (cmd) {
3659         case SIOCADDRT:         /* Add a route */
3660         case SIOCDELRT:         /* Delete a route */
3661                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3662                         return -EPERM;
3663                 err = copy_from_user(&rtmsg, arg,
3664                                      sizeof(struct in6_rtmsg));
3665                 if (err)
3666                         return -EFAULT;
3667
3668                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3669
3670                 rtnl_lock();
3671                 switch (cmd) {
3672                 case SIOCADDRT:
3673                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3674                         break;
3675                 case SIOCDELRT:
3676                         err = ip6_route_del(&cfg, NULL);
3677                         break;
3678                 default:
3679                         err = -EINVAL;
3680                 }
3681                 rtnl_unlock();
3682
3683                 return err;
3684         }
3685
3686         return -EINVAL;
3687 }
3688
3689 /*
3690  *      Drop the packet on the floor
3691  */
3692
3693 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3694 {
3695         int type;
3696         struct dst_entry *dst = skb_dst(skb);
3697         switch (ipstats_mib_noroutes) {
3698         case IPSTATS_MIB_INNOROUTES:
3699                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3700                 if (type == IPV6_ADDR_ANY) {
3701                         IP6_INC_STATS(dev_net(dst->dev),
3702                                       __in6_dev_get_safely(skb->dev),
3703                                       IPSTATS_MIB_INADDRERRORS);
3704                         break;
3705                 }
3706                 /* FALLTHROUGH */
3707         case IPSTATS_MIB_OUTNOROUTES:
3708                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3709                               ipstats_mib_noroutes);
3710                 break;
3711         }
3712         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3713         kfree_skb(skb);
3714         return 0;
3715 }
3716
3717 static int ip6_pkt_discard(struct sk_buff *skb)
3718 {
3719         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3720 }
3721
3722 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3723 {
3724         skb->dev = skb_dst(skb)->dev;
3725         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3726 }
3727
3728 static int ip6_pkt_prohibit(struct sk_buff *skb)
3729 {
3730         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3731 }
3732
3733 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3734 {
3735         skb->dev = skb_dst(skb)->dev;
3736         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3737 }
3738
3739 /*
3740  *      Allocate a dst for local (unicast / anycast) address.
3741  */
3742
3743 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3744                                      struct inet6_dev *idev,
3745                                      const struct in6_addr *addr,
3746                                      bool anycast, gfp_t gfp_flags)
3747 {
3748         u32 tb_id;
3749         struct net_device *dev = idev->dev;
3750         struct fib6_info *f6i;
3751
3752         f6i = fib6_info_alloc(gfp_flags);
3753         if (!f6i)
3754                 return ERR_PTR(-ENOMEM);
3755
3756         f6i->dst_nocount = true;
3757         f6i->dst_host = true;
3758         f6i->fib6_protocol = RTPROT_KERNEL;
3759         f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3760         if (anycast) {
3761                 f6i->fib6_type = RTN_ANYCAST;
3762                 f6i->fib6_flags |= RTF_ANYCAST;
3763         } else {
3764                 f6i->fib6_type = RTN_LOCAL;
3765                 f6i->fib6_flags |= RTF_LOCAL;
3766         }
3767
3768         f6i->fib6_nh.nh_gw = *addr;
3769         dev_hold(dev);
3770         f6i->fib6_nh.nh_dev = dev;
3771         f6i->fib6_dst.addr = *addr;
3772         f6i->fib6_dst.plen = 128;
3773         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3774         f6i->fib6_table = fib6_get_table(net, tb_id);
3775
3776         return f6i;
3777 }
3778
/* remove deleted ip from prefsrc entries */
/* Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device to match; NULL matches any */
	struct net *net;		/* namespace owning fib6_null_entry */
	struct in6_addr *addr;		/* address compared with fib6_prefsrc */
};
3785
3786 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3787 {
3788         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3789         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3790         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3791
3792         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3793             rt != net->ipv6.fib6_null_entry &&
3794             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3795                 spin_lock_bh(&rt6_exception_lock);
3796                 /* remove prefsrc entry */
3797                 rt->fib6_prefsrc.plen = 0;
3798                 /* need to update cache as well */
3799                 rt6_exceptions_remove_prefsrc(rt);
3800                 spin_unlock_bh(&rt6_exception_lock);
3801         }
3802         return 0;
3803 }
3804
3805 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3806 {
3807         struct net *net = dev_net(ifp->idev->dev);
3808         struct arg_dev_net_ip adni = {
3809                 .dev = ifp->idev->dev,
3810                 .net = net,
3811                 .addr = &ifp->addr,
3812         };
3813         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3814 }
3815
3816 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3817
3818 /* Remove routers and update dst entries when gateway turn into host. */
3819 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3820 {
3821         struct in6_addr *gateway = (struct in6_addr *)arg;
3822
3823         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3824             ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3825                 return -1;
3826         }
3827
3828         /* Further clean up cached routes in exception table.
3829          * This is needed because cached route may have a different
3830          * gateway than its 'parent' in the case of an ip redirect.
3831          */
3832         rt6_exceptions_clean_tohost(rt, gateway);
3833
3834         return 0;
3835 }
3836
3837 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3838 {
3839         fib6_clean_all(net, fib6_clean_tohost, gateway);
3840 }
3841
/*
 * Argument for the fib6_ifup() / fib6_ifdown() walkers.  The two union
 * members share storage: fib6_ifup() reads nh_flags, fib6_ifdown() reads
 * event.
 */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event refers to */
	union {
		unsigned int nh_flags;	/* nexthop flags to clear (ifup) */
		unsigned long event;	/* NETDEV_* event code (ifdown) */
	};
};
3849
3850 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3851 {
3852         struct fib6_info *iter;
3853         struct fib6_node *fn;
3854
3855         fn = rcu_dereference_protected(rt->fib6_node,
3856                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3857         iter = rcu_dereference_protected(fn->leaf,
3858                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3859         while (iter) {
3860                 if (iter->fib6_metric == rt->fib6_metric &&
3861                     rt6_qualify_for_ecmp(iter))
3862                         return iter;
3863                 iter = rcu_dereference_protected(iter->fib6_next,
3864                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3865         }
3866
3867         return NULL;
3868 }
3869
3870 static bool rt6_is_dead(const struct fib6_info *rt)
3871 {
3872         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3873             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3874              fib6_ignore_linkdown(rt)))
3875                 return true;
3876
3877         return false;
3878 }
3879
3880 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3881 {
3882         struct fib6_info *iter;
3883         int total = 0;
3884
3885         if (!rt6_is_dead(rt))
3886                 total += rt->fib6_nh.nh_weight;
3887
3888         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3889                 if (!rt6_is_dead(iter))
3890                         total += iter->fib6_nh.nh_weight;
3891         }
3892
3893         return total;
3894 }
3895
3896 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3897 {
3898         int upper_bound = -1;
3899
3900         if (!rt6_is_dead(rt)) {
3901                 *weight += rt->fib6_nh.nh_weight;
3902                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3903                                                     total) - 1;
3904         }
3905         atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3906 }
3907
3908 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3909 {
3910         struct fib6_info *iter;
3911         int weight = 0;
3912
3913         rt6_upper_bound_set(rt, &weight, total);
3914
3915         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3916                 rt6_upper_bound_set(iter, &weight, total);
3917 }
3918
3919 void rt6_multipath_rebalance(struct fib6_info *rt)
3920 {
3921         struct fib6_info *first;
3922         int total;
3923
3924         /* In case the entire multipath route was marked for flushing,
3925          * then there is no need to rebalance upon the removal of every
3926          * sibling route.
3927          */
3928         if (!rt->fib6_nsiblings || rt->should_flush)
3929                 return;
3930
3931         /* During lookup routes are evaluated in order, so we need to
3932          * make sure upper bounds are assigned from the first sibling
3933          * onwards.
3934          */
3935         first = rt6_multipath_first_sibling(rt);
3936         if (WARN_ON_ONCE(!first))
3937                 return;
3938
3939         total = rt6_multipath_total_weight(first);
3940         rt6_multipath_upper_bound_set(first, total);
3941 }
3942
3943 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3944 {
3945         const struct arg_netdev_event *arg = p_arg;
3946         struct net *net = dev_net(arg->dev);
3947
3948         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3949                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3950                 fib6_update_sernum_upto_root(net, rt);
3951                 rt6_multipath_rebalance(rt);
3952         }
3953
3954         return 0;
3955 }
3956
3957 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3958 {
3959         struct arg_netdev_event arg = {
3960                 .dev = dev,
3961                 {
3962                         .nh_flags = nh_flags,
3963                 },
3964         };
3965
3966         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3967                 arg.nh_flags |= RTNH_F_LINKDOWN;
3968
3969         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3970 }
3971
3972 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3973                                    const struct net_device *dev)
3974 {
3975         struct fib6_info *iter;
3976
3977         if (rt->fib6_nh.nh_dev == dev)
3978                 return true;
3979         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3980                 if (iter->fib6_nh.nh_dev == dev)
3981                         return true;
3982
3983         return false;
3984 }
3985
3986 static void rt6_multipath_flush(struct fib6_info *rt)
3987 {
3988         struct fib6_info *iter;
3989
3990         rt->should_flush = 1;
3991         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3992                 iter->should_flush = 1;
3993 }
3994
3995 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3996                                              const struct net_device *down_dev)
3997 {
3998         struct fib6_info *iter;
3999         unsigned int dead = 0;
4000
4001         if (rt->fib6_nh.nh_dev == down_dev ||
4002             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4003                 dead++;
4004         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4005                 if (iter->fib6_nh.nh_dev == down_dev ||
4006                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
4007                         dead++;
4008
4009         return dead;
4010 }
4011
4012 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4013                                        const struct net_device *dev,
4014                                        unsigned int nh_flags)
4015 {
4016         struct fib6_info *iter;
4017
4018         if (rt->fib6_nh.nh_dev == dev)
4019                 rt->fib6_nh.nh_flags |= nh_flags;
4020         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4021                 if (iter->fib6_nh.nh_dev == dev)
4022                         iter->fib6_nh.nh_flags |= nh_flags;
4023 }
4024
4025 /* called with write lock held for table with rt */
4026 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4027 {
4028         const struct arg_netdev_event *arg = p_arg;
4029         const struct net_device *dev = arg->dev;
4030         struct net *net = dev_net(dev);
4031
4032         if (rt == net->ipv6.fib6_null_entry)
4033                 return 0;
4034
4035         switch (arg->event) {
4036         case NETDEV_UNREGISTER:
4037                 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4038         case NETDEV_DOWN:
4039                 if (rt->should_flush)
4040                         return -1;
4041                 if (!rt->fib6_nsiblings)
4042                         return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4043                 if (rt6_multipath_uses_dev(rt, dev)) {
4044                         unsigned int count;
4045
4046                         count = rt6_multipath_dead_count(rt, dev);
4047                         if (rt->fib6_nsiblings + 1 == count) {
4048                                 rt6_multipath_flush(rt);
4049                                 return -1;
4050                         }
4051                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4052                                                    RTNH_F_LINKDOWN);
4053                         fib6_update_sernum(net, rt);
4054                         rt6_multipath_rebalance(rt);
4055                 }
4056                 return -2;
4057         case NETDEV_CHANGE:
4058                 if (rt->fib6_nh.nh_dev != dev ||
4059                     rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4060                         break;
4061                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4062                 rt6_multipath_rebalance(rt);
4063                 break;
4064         }
4065
4066         return 0;
4067 }
4068
4069 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4070 {
4071         struct arg_netdev_event arg = {
4072                 .dev = dev,
4073                 {
4074                         .event = event,
4075                 },
4076         };
4077
4078         fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4079 }
4080
4081 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4082 {
4083         rt6_sync_down_dev(dev, event);
4084         rt6_uncached_list_flush_dev(dev_net(dev), dev);
4085         neigh_ifdown(&nd_tbl, dev);
4086 }
4087
/* Argument handed to rt6_mtu_change_route() through fib6_clean_all(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
4092
4093 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4094 {
4095         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4096         struct inet6_dev *idev;
4097
4098         /* In IPv6 pmtu discovery is not optional,
4099            so that RTAX_MTU lock cannot disable it.
4100            We still use this lock to block changes
4101            caused by addrconf/ndisc.
4102         */
4103
4104         idev = __in6_dev_get(arg->dev);
4105         if (!idev)
4106                 return 0;
4107
4108         /* For administrative MTU increase, there is no way to discover
4109            IPv6 PMTU increase, so PMTU increase should be updated here.
4110            Since RFC 1981 doesn't include administrative MTU increase
4111            update PMTU increase is a MUST. (i.e. jumbo frame)
4112          */
4113         if (rt->fib6_nh.nh_dev == arg->dev &&
4114             !fib6_metric_locked(rt, RTAX_MTU)) {
4115                 u32 mtu = rt->fib6_pmtu;
4116
4117                 if (mtu >= arg->mtu ||
4118                     (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4119                         fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4120
4121                 spin_lock_bh(&rt6_exception_lock);
4122                 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4123                 spin_unlock_bh(&rt6_exception_lock);
4124         }
4125         return 0;
4126 }
4127
4128 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4129 {
4130         struct rt6_mtu_change_arg arg = {
4131                 .dev = dev,
4132                 .mtu = mtu,
4133         };
4134
4135         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4136 }
4137
4138 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4139         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4140         [RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
4141         [RTA_OIF]               = { .type = NLA_U32 },
4142         [RTA_IIF]               = { .type = NLA_U32 },
4143         [RTA_PRIORITY]          = { .type = NLA_U32 },
4144         [RTA_METRICS]           = { .type = NLA_NESTED },
4145         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
4146         [RTA_PREF]              = { .type = NLA_U8 },
4147         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
4148         [RTA_ENCAP]             = { .type = NLA_NESTED },
4149         [RTA_EXPIRES]           = { .type = NLA_U32 },
4150         [RTA_UID]               = { .type = NLA_U32 },
4151         [RTA_MARK]              = { .type = NLA_U32 },
4152         [RTA_TABLE]             = { .type = NLA_U32 },
4153         [RTA_IP_PROTO]          = { .type = NLA_U8 },
4154         [RTA_SPORT]             = { .type = NLA_U16 },
4155         [RTA_DPORT]             = { .type = NLA_U16 },
4156 };
4157
4158 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4159                               struct fib6_config *cfg,
4160                               struct netlink_ext_ack *extack)
4161 {
4162         struct rtmsg *rtm;
4163         struct nlattr *tb[RTA_MAX+1];
4164         unsigned int pref;
4165         int err;
4166
4167         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4168                           NULL);
4169         if (err < 0)
4170                 goto errout;
4171
4172         err = -EINVAL;
4173         rtm = nlmsg_data(nlh);
4174         memset(cfg, 0, sizeof(*cfg));
4175
4176         cfg->fc_table = rtm->rtm_table;
4177         cfg->fc_dst_len = rtm->rtm_dst_len;
4178         cfg->fc_src_len = rtm->rtm_src_len;
4179         cfg->fc_flags = RTF_UP;
4180         cfg->fc_protocol = rtm->rtm_protocol;
4181         cfg->fc_type = rtm->rtm_type;
4182
4183         if (rtm->rtm_type == RTN_UNREACHABLE ||
4184             rtm->rtm_type == RTN_BLACKHOLE ||
4185             rtm->rtm_type == RTN_PROHIBIT ||
4186             rtm->rtm_type == RTN_THROW)
4187                 cfg->fc_flags |= RTF_REJECT;
4188
4189         if (rtm->rtm_type == RTN_LOCAL)
4190                 cfg->fc_flags |= RTF_LOCAL;
4191
4192         if (rtm->rtm_flags & RTM_F_CLONED)
4193                 cfg->fc_flags |= RTF_CACHE;
4194
4195         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4196
4197         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4198         cfg->fc_nlinfo.nlh = nlh;
4199         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4200
4201         if (tb[RTA_GATEWAY]) {
4202                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4203                 cfg->fc_flags |= RTF_GATEWAY;
4204         }
4205
4206         if (tb[RTA_DST]) {
4207                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4208
4209                 if (nla_len(tb[RTA_DST]) < plen)
4210                         goto errout;
4211
4212                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4213         }
4214
4215         if (tb[RTA_SRC]) {
4216                 int plen = (rtm->rtm_src_len + 7) >> 3;
4217
4218                 if (nla_len(tb[RTA_SRC]) < plen)
4219                         goto errout;
4220
4221                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4222         }
4223
4224         if (tb[RTA_PREFSRC])
4225                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4226
4227         if (tb[RTA_OIF])
4228                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4229
4230         if (tb[RTA_PRIORITY])
4231                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4232
4233         if (tb[RTA_METRICS]) {
4234                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4235                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4236         }
4237
4238         if (tb[RTA_TABLE])
4239                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4240
4241         if (tb[RTA_MULTIPATH]) {
4242                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4243                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4244
4245                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4246                                                      cfg->fc_mp_len, extack);
4247                 if (err < 0)
4248                         goto errout;
4249         }
4250
4251         if (tb[RTA_PREF]) {
4252                 pref = nla_get_u8(tb[RTA_PREF]);
4253                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4254                     pref != ICMPV6_ROUTER_PREF_HIGH)
4255                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4256                 cfg->fc_flags |= RTF_PREF(pref);
4257         }
4258
4259         if (tb[RTA_ENCAP])
4260                 cfg->fc_encap = tb[RTA_ENCAP];
4261
4262         if (tb[RTA_ENCAP_TYPE]) {
4263                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4264
4265                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4266                 if (err < 0)
4267                         goto errout;
4268         }
4269
4270         if (tb[RTA_EXPIRES]) {
4271                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4272
4273                 if (addrconf_finite_timeout(timeout)) {
4274                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4275                         cfg->fc_flags |= RTF_EXPIRES;
4276                 }
4277         }
4278
4279         err = 0;
4280 errout:
4281         return err;
4282 }
4283
4284 struct rt6_nh {
4285         struct fib6_info *fib6_info;
4286         struct fib6_config r_cfg;
4287         struct list_head next;
4288 };
4289
4290 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4291 {
4292         struct rt6_nh *nh;
4293
4294         list_for_each_entry(nh, rt6_nh_list, next) {
4295                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4296                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4297                         nh->r_cfg.fc_ifindex);
4298         }
4299 }
4300
4301 static int ip6_route_info_append(struct net *net,
4302                                  struct list_head *rt6_nh_list,
4303                                  struct fib6_info *rt,
4304                                  struct fib6_config *r_cfg)
4305 {
4306         struct rt6_nh *nh;
4307         int err = -EEXIST;
4308
4309         list_for_each_entry(nh, rt6_nh_list, next) {
4310                 /* check if fib6_info already exists */
4311                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4312                         return err;
4313         }
4314
4315         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4316         if (!nh)
4317                 return -ENOMEM;
4318         nh->fib6_info = rt;
4319         err = ip6_convert_metrics(net, rt, r_cfg);
4320         if (err) {
4321                 kfree(nh);
4322                 return err;
4323         }
4324         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4325         list_add_tail(&nh->next, rt6_nh_list);
4326
4327         return 0;
4328 }
4329
4330 static void ip6_route_mpath_notify(struct fib6_info *rt,
4331                                    struct fib6_info *rt_last,
4332                                    struct nl_info *info,
4333                                    __u16 nlflags)
4334 {
4335         /* if this is an APPEND route, then rt points to the first route
4336          * inserted and rt_last points to last route inserted. Userspace
4337          * wants a consistent dump of the route which starts at the first
4338          * nexthop. Since sibling routes are always added at the end of
4339          * the list, find the first sibling of the last route appended
4340          */
4341         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4342                 rt = list_first_entry(&rt_last->fib6_siblings,
4343                                       struct fib6_info,
4344                                       fib6_siblings);
4345         }
4346
4347         if (rt)
4348                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4349 }
4350
4351 static int ip6_route_multipath_add(struct fib6_config *cfg,
4352                                    struct netlink_ext_ack *extack)
4353 {
4354         struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4355         struct nl_info *info = &cfg->fc_nlinfo;
4356         struct fib6_config r_cfg;
4357         struct rtnexthop *rtnh;
4358         struct fib6_info *rt;
4359         struct rt6_nh *err_nh;
4360         struct rt6_nh *nh, *nh_safe;
4361         __u16 nlflags;
4362         int remaining;
4363         int attrlen;
4364         int err = 1;
4365         int nhn = 0;
4366         int replace = (cfg->fc_nlinfo.nlh &&
4367                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4368         LIST_HEAD(rt6_nh_list);
4369
4370         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4371         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4372                 nlflags |= NLM_F_APPEND;
4373
4374         remaining = cfg->fc_mp_len;
4375         rtnh = (struct rtnexthop *)cfg->fc_mp;
4376
4377         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4378          * fib6_info structs per nexthop
4379          */
4380         while (rtnh_ok(rtnh, remaining)) {
4381                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4382                 if (rtnh->rtnh_ifindex)
4383                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4384
4385                 attrlen = rtnh_attrlen(rtnh);
4386                 if (attrlen > 0) {
4387                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4388
4389                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4390                         if (nla) {
4391                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4392                                 r_cfg.fc_flags |= RTF_GATEWAY;
4393                         }
4394                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4395                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4396                         if (nla)
4397                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4398                 }
4399
4400                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4401                 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4402                 if (IS_ERR(rt)) {
4403                         err = PTR_ERR(rt);
4404                         rt = NULL;
4405                         goto cleanup;
4406                 }
4407                 if (!rt6_qualify_for_ecmp(rt)) {
4408                         err = -EINVAL;
4409                         NL_SET_ERR_MSG(extack,
4410                                        "Device only routes can not be added for IPv6 using the multipath API.");
4411                         fib6_info_release(rt);
4412                         goto cleanup;
4413                 }
4414
4415                 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4416
4417                 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4418                                             rt, &r_cfg);
4419                 if (err) {
4420                         fib6_info_release(rt);
4421                         goto cleanup;
4422                 }
4423
4424                 rtnh = rtnh_next(rtnh, &remaining);
4425         }
4426
4427         /* for add and replace send one notification with all nexthops.
4428          * Skip the notification in fib6_add_rt2node and send one with
4429          * the full route when done
4430          */
4431         info->skip_notify = 1;
4432
4433         err_nh = NULL;
4434         list_for_each_entry(nh, &rt6_nh_list, next) {
4435                 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4436                 fib6_info_release(nh->fib6_info);
4437
4438                 if (!err) {
4439                         /* save reference to last route successfully inserted */
4440                         rt_last = nh->fib6_info;
4441
4442                         /* save reference to first route for notification */
4443                         if (!rt_notif)
4444                                 rt_notif = nh->fib6_info;
4445                 }
4446
4447                 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4448                 nh->fib6_info = NULL;
4449                 if (err) {
4450                         if (replace && nhn)
4451                                 ip6_print_replace_route_err(&rt6_nh_list);
4452                         err_nh = nh;
4453                         goto add_errout;
4454                 }
4455
4456                 /* Because each route is added like a single route we remove
4457                  * these flags after the first nexthop: if there is a collision,
4458                  * we have already failed to add the first nexthop:
4459                  * fib6_add_rt2node() has rejected it; when replacing, old
4460                  * nexthops have been replaced by first new, the rest should
4461                  * be added to it.
4462                  */
4463                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4464                                                      NLM_F_REPLACE);
4465                 nhn++;
4466         }
4467
4468         /* success ... tell user about new route */
4469         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4470         goto cleanup;
4471
4472 add_errout:
4473         /* send notification for routes that were added so that
4474          * the delete notifications sent by ip6_route_del are
4475          * coherent
4476          */
4477         if (rt_notif)
4478                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4479
4480         /* Delete routes that were already added */
4481         list_for_each_entry(nh, &rt6_nh_list, next) {
4482                 if (err_nh == nh)
4483                         break;
4484                 ip6_route_del(&nh->r_cfg, extack);
4485         }
4486
4487 cleanup:
4488         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4489                 if (nh->fib6_info)
4490                         fib6_info_release(nh->fib6_info);
4491                 list_del(&nh->next);
4492                 kfree(nh);
4493         }
4494
4495         return err;
4496 }
4497
4498 static int ip6_route_multipath_del(struct fib6_config *cfg,
4499                                    struct netlink_ext_ack *extack)
4500 {
4501         struct fib6_config r_cfg;
4502         struct rtnexthop *rtnh;
4503         int remaining;
4504         int attrlen;
4505         int err = 1, last_err = 0;
4506
4507         remaining = cfg->fc_mp_len;
4508         rtnh = (struct rtnexthop *)cfg->fc_mp;
4509
4510         /* Parse a Multipath Entry */
4511         while (rtnh_ok(rtnh, remaining)) {
4512                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4513                 if (rtnh->rtnh_ifindex)
4514                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4515
4516                 attrlen = rtnh_attrlen(rtnh);
4517                 if (attrlen > 0) {
4518                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4519
4520                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4521                         if (nla) {
4522                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4523                                 r_cfg.fc_flags |= RTF_GATEWAY;
4524                         }
4525                 }
4526                 err = ip6_route_del(&r_cfg, extack);
4527                 if (err)
4528                         last_err = err;
4529
4530                 rtnh = rtnh_next(rtnh, &remaining);
4531         }
4532
4533         return last_err;
4534 }
4535
4536 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4537                               struct netlink_ext_ack *extack)
4538 {
4539         struct fib6_config cfg;
4540         int err;
4541
4542         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4543         if (err < 0)
4544                 return err;
4545
4546         if (cfg.fc_mp)
4547                 return ip6_route_multipath_del(&cfg, extack);
4548         else {
4549                 cfg.fc_delete_all_nh = 1;
4550                 return ip6_route_del(&cfg, extack);
4551         }
4552 }
4553
4554 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4555                               struct netlink_ext_ack *extack)
4556 {
4557         struct fib6_config cfg;
4558         int err;
4559
4560         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4561         if (err < 0)
4562                 return err;
4563
4564         if (cfg.fc_mp)
4565                 return ip6_route_multipath_add(&cfg, extack);
4566         else
4567                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4568 }
4569
4570 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4571 {
4572         int nexthop_len = 0;
4573
4574         if (rt->fib6_nsiblings) {
4575                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4576                             + NLA_ALIGN(sizeof(struct rtnexthop))
4577                             + nla_total_size(16) /* RTA_GATEWAY */
4578                             + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4579
4580                 nexthop_len *= rt->fib6_nsiblings;
4581         }
4582
4583         return NLMSG_ALIGN(sizeof(struct rtmsg))
4584                + nla_total_size(16) /* RTA_SRC */
4585                + nla_total_size(16) /* RTA_DST */
4586                + nla_total_size(16) /* RTA_GATEWAY */
4587                + nla_total_size(16) /* RTA_PREFSRC */
4588                + nla_total_size(4) /* RTA_TABLE */
4589                + nla_total_size(4) /* RTA_IIF */
4590                + nla_total_size(4) /* RTA_OIF */
4591                + nla_total_size(4) /* RTA_PRIORITY */
4592                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4593                + nla_total_size(sizeof(struct rta_cacheinfo))
4594                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4595                + nla_total_size(1) /* RTA_PREF */
4596                + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4597                + nexthop_len;
4598 }
4599
4600 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4601                             unsigned int *flags, bool skip_oif)
4602 {
4603         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4604                 *flags |= RTNH_F_DEAD;
4605
4606         if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4607                 *flags |= RTNH_F_LINKDOWN;
4608
4609                 rcu_read_lock();
4610                 if (fib6_ignore_linkdown(rt))
4611                         *flags |= RTNH_F_DEAD;
4612                 rcu_read_unlock();
4613         }
4614
4615         if (rt->fib6_flags & RTF_GATEWAY) {
4616                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4617                         goto nla_put_failure;
4618         }
4619
4620         *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4621         if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4622                 *flags |= RTNH_F_OFFLOAD;
4623
4624         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4625         if (!skip_oif && rt->fib6_nh.nh_dev &&
4626             nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4627                 goto nla_put_failure;
4628
4629         if (rt->fib6_nh.nh_lwtstate &&
4630             lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4631                 goto nla_put_failure;
4632
4633         return 0;
4634
4635 nla_put_failure:
4636         return -EMSGSIZE;
4637 }
4638
4639 /* add multipath next hop */
4640 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4641 {
4642         const struct net_device *dev = rt->fib6_nh.nh_dev;
4643         struct rtnexthop *rtnh;
4644         unsigned int flags = 0;
4645
4646         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4647         if (!rtnh)
4648                 goto nla_put_failure;
4649
4650         rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4651         rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4652
4653         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4654                 goto nla_put_failure;
4655
4656         rtnh->rtnh_flags = flags;
4657
4658         /* length of rtnetlink header + attributes */
4659         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4660
4661         return 0;
4662
4663 nla_put_failure:
4664         return -EMSGSIZE;
4665 }
4666
4667 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4668                          struct fib6_info *rt, struct dst_entry *dst,
4669                          struct in6_addr *dest, struct in6_addr *src,
4670                          int iif, int type, u32 portid, u32 seq,
4671                          unsigned int flags)
4672 {
4673         struct rtmsg *rtm;
4674         struct nlmsghdr *nlh;
4675         long expires = 0;
4676         u32 *pmetrics;
4677         u32 table;
4678
4679         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4680         if (!nlh)
4681                 return -EMSGSIZE;
4682
4683         rtm = nlmsg_data(nlh);
4684         rtm->rtm_family = AF_INET6;
4685         rtm->rtm_dst_len = rt->fib6_dst.plen;
4686         rtm->rtm_src_len = rt->fib6_src.plen;
4687         rtm->rtm_tos = 0;
4688         if (rt->fib6_table)
4689                 table = rt->fib6_table->tb6_id;
4690         else
4691                 table = RT6_TABLE_UNSPEC;
4692         rtm->rtm_table = table;
4693         if (nla_put_u32(skb, RTA_TABLE, table))
4694                 goto nla_put_failure;
4695
4696         rtm->rtm_type = rt->fib6_type;
4697         rtm->rtm_flags = 0;
4698         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4699         rtm->rtm_protocol = rt->fib6_protocol;
4700
4701         if (rt->fib6_flags & RTF_CACHE)
4702                 rtm->rtm_flags |= RTM_F_CLONED;
4703
4704         if (dest) {
4705                 if (nla_put_in6_addr(skb, RTA_DST, dest))
4706                         goto nla_put_failure;
4707                 rtm->rtm_dst_len = 128;
4708         } else if (rtm->rtm_dst_len)
4709                 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
4710                         goto nla_put_failure;
4711 #ifdef CONFIG_IPV6_SUBTREES
4712         if (src) {
4713                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4714                         goto nla_put_failure;
4715                 rtm->rtm_src_len = 128;
4716         } else if (rtm->rtm_src_len &&
4717                    nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
4718                 goto nla_put_failure;
4719 #endif
4720         if (iif) {
4721 #ifdef CONFIG_IPV6_MROUTE
4722                 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
4723                         int err = ip6mr_get_route(net, skb, rtm, portid);
4724
4725                         if (err == 0)
4726                                 return 0;
4727                         if (err < 0)
4728                                 goto nla_put_failure;
4729                 } else
4730 #endif
4731                         if (nla_put_u32(skb, RTA_IIF, iif))
4732                                 goto nla_put_failure;
4733         } else if (dest) {
4734                 struct in6_addr saddr_buf;
4735                 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4736                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4737                         goto nla_put_failure;
4738         }
4739
4740         if (rt->fib6_prefsrc.plen) {
4741                 struct in6_addr saddr_buf;
4742                 saddr_buf = rt->fib6_prefsrc.addr;
4743                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4744                         goto nla_put_failure;
4745         }
4746
4747         pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4748         if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4749                 goto nla_put_failure;
4750
4751         if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4752                 goto nla_put_failure;
4753
4754         /* For multipath routes, walk the siblings list and add
4755          * each as a nexthop within RTA_MULTIPATH.
4756          */
4757         if (rt->fib6_nsiblings) {
4758                 struct fib6_info *sibling, *next_sibling;
4759                 struct nlattr *mp;
4760
4761                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4762                 if (!mp)
4763                         goto nla_put_failure;
4764
4765                 if (rt6_add_nexthop(skb, rt) < 0)
4766                         goto nla_put_failure;
4767
4768                 list_for_each_entry_safe(sibling, next_sibling,
4769                                          &rt->fib6_siblings, fib6_siblings) {
4770                         if (rt6_add_nexthop(skb, sibling) < 0)
4771                                 goto nla_put_failure;
4772                 }
4773
4774                 nla_nest_end(skb, mp);
4775         } else {
4776                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4777                         goto nla_put_failure;
4778         }
4779
4780         if (rt->fib6_flags & RTF_EXPIRES) {
4781                 expires = dst ? dst->expires : rt->expires;
4782                 expires -= jiffies;
4783         }
4784
4785         if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4786                 goto nla_put_failure;
4787
4788         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
4789                 goto nla_put_failure;
4790
4791
4792         nlmsg_end(skb, nlh);
4793         return 0;
4794
4795 nla_put_failure:
4796         nlmsg_cancel(skb, nlh);
4797         return -EMSGSIZE;
4798 }
4799
4800 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4801 {
4802         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4803         struct net *net = arg->net;
4804
4805         if (rt == net->ipv6.fib6_null_entry)
4806                 return 0;
4807
4808         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4809                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4810
4811                 /* user wants prefix routes only */
4812                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4813                     !(rt->fib6_flags & RTF_PREFIX_RT)) {
4814                         /* success since this is not a prefix route */
4815                         return 1;
4816                 }
4817         }
4818
4819         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4820                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4821                              arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4822 }
4823
4824 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4825                               struct netlink_ext_ack *extack)
4826 {
4827         struct net *net = sock_net(in_skb->sk);
4828         struct nlattr *tb[RTA_MAX+1];
4829         int err, iif = 0, oif = 0;
4830         struct fib6_info *from;
4831         struct dst_entry *dst;
4832         struct rt6_info *rt;
4833         struct sk_buff *skb;
4834         struct rtmsg *rtm;
4835         struct flowi6 fl6;
4836         bool fibmatch;
4837
4838         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4839                           extack);
4840         if (err < 0)
4841                 goto errout;
4842
4843         err = -EINVAL;
4844         memset(&fl6, 0, sizeof(fl6));
4845         rtm = nlmsg_data(nlh);
4846         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4847         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4848
4849         if (tb[RTA_SRC]) {
4850                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4851                         goto errout;
4852
4853                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4854         }
4855
4856         if (tb[RTA_DST]) {
4857                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4858                         goto errout;
4859
4860                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4861         }
4862
4863         if (tb[RTA_IIF])
4864                 iif = nla_get_u32(tb[RTA_IIF]);
4865
4866         if (tb[RTA_OIF])
4867                 oif = nla_get_u32(tb[RTA_OIF]);
4868
4869         if (tb[RTA_MARK])
4870                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4871
4872         if (tb[RTA_UID])
4873                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4874                                            nla_get_u32(tb[RTA_UID]));
4875         else
4876                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4877
4878         if (tb[RTA_SPORT])
4879                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4880
4881         if (tb[RTA_DPORT])
4882                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4883
4884         if (tb[RTA_IP_PROTO]) {
4885                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4886                                                   &fl6.flowi6_proto, extack);
4887                 if (err)
4888                         goto errout;
4889         }
4890
4891         if (iif) {
4892                 struct net_device *dev;
4893                 int flags = 0;
4894
4895                 rcu_read_lock();
4896
4897                 dev = dev_get_by_index_rcu(net, iif);
4898                 if (!dev) {
4899                         rcu_read_unlock();
4900                         err = -ENODEV;
4901                         goto errout;
4902                 }
4903
4904                 fl6.flowi6_iif = iif;
4905
4906                 if (!ipv6_addr_any(&fl6.saddr))
4907                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4908
4909                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4910
4911                 rcu_read_unlock();
4912         } else {
4913                 fl6.flowi6_oif = oif;
4914
4915                 dst = ip6_route_output(net, NULL, &fl6);
4916         }
4917
4918
4919         rt = container_of(dst, struct rt6_info, dst);
4920         if (rt->dst.error) {
4921                 err = rt->dst.error;
4922                 ip6_rt_put(rt);
4923                 goto errout;
4924         }
4925
4926         if (rt == net->ipv6.ip6_null_entry) {
4927                 err = rt->dst.error;
4928                 ip6_rt_put(rt);
4929                 goto errout;
4930         }
4931
4932         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4933         if (!skb) {
4934                 ip6_rt_put(rt);
4935                 err = -ENOBUFS;
4936                 goto errout;
4937         }
4938
4939         skb_dst_set(skb, &rt->dst);
4940
4941         rcu_read_lock();
4942         from = rcu_dereference(rt->from);
4943
4944         if (fibmatch)
4945                 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4946                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4947                                     nlh->nlmsg_seq, 0);
4948         else
4949                 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4950                                     &fl6.saddr, iif, RTM_NEWROUTE,
4951                                     NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4952                                     0);
4953         rcu_read_unlock();
4954
4955         if (err < 0) {
4956                 kfree_skb(skb);
4957                 goto errout;
4958         }
4959
4960         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4961 errout:
4962         return err;
4963 }
4964
4965 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4966                      unsigned int nlm_flags)
4967 {
4968         struct sk_buff *skb;
4969         struct net *net = info->nl_net;
4970         u32 seq;
4971         int err;
4972
4973         err = -ENOBUFS;
4974         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4975
4976         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4977         if (!skb)
4978                 goto errout;
4979
4980         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4981                             event, info->portid, seq, nlm_flags);
4982         if (err < 0) {
4983                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4984                 WARN_ON(err == -EMSGSIZE);
4985                 kfree_skb(skb);
4986                 goto errout;
4987         }
4988         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4989                     info->nlh, gfp_any());
4990         return;
4991 errout:
4992         if (err < 0)
4993                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4994 }
4995
4996 static int ip6_route_dev_notify(struct notifier_block *this,
4997                                 unsigned long event, void *ptr)
4998 {
4999         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5000         struct net *net = dev_net(dev);
5001
5002         if (!(dev->flags & IFF_LOOPBACK))
5003                 return NOTIFY_OK;
5004
5005         if (event == NETDEV_REGISTER) {
5006                 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5007                 net->ipv6.ip6_null_entry->dst.dev = dev;
5008                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5009 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5010                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5011                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5012                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5013                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5014 #endif
5015          } else if (event == NETDEV_UNREGISTER &&
5016                     dev->reg_state != NETREG_UNREGISTERED) {
5017                 /* NETDEV_UNREGISTER could be fired for multiple times by
5018                  * netdev_wait_allrefs(). Make sure we only call this once.
5019                  */
5020                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5021 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5022                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5023                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5024 #endif
5025         }
5026
5027         return NOTIFY_OK;
5028 }
5029
5030 /*
5031  *      /proc
5032  */
5033
5034 #ifdef CONFIG_PROC_FS
5035 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5036 {
5037         struct net *net = (struct net *)seq->private;
5038         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5039                    net->ipv6.rt6_stats->fib_nodes,
5040                    net->ipv6.rt6_stats->fib_route_nodes,
5041                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5042                    net->ipv6.rt6_stats->fib_rt_entries,
5043                    net->ipv6.rt6_stats->fib_rt_cache,
5044                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5045                    net->ipv6.rt6_stats->fib_discarded_routes);
5046
5047         return 0;
5048 }
5049 #endif  /* CONFIG_PROC_FS */
5050
5051 #ifdef CONFIG_SYSCTL
5052
5053 static
5054 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5055                               void __user *buffer, size_t *lenp, loff_t *ppos)
5056 {
5057         struct net *net;
5058         int delay;
5059         if (!write)
5060                 return -EINVAL;
5061
5062         net = (struct net *)ctl->extra1;
5063         delay = net->ipv6.sysctl.flush_delay;
5064         proc_dointvec(ctl, write, buffer, lenp, ppos);
5065         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5066         return 0;
5067 }
5068
5069 struct ctl_table ipv6_route_table_template[] = {
5070         {
5071                 .procname       =       "flush",
5072                 .data           =       &init_net.ipv6.sysctl.flush_delay,
5073                 .maxlen         =       sizeof(int),
5074                 .mode           =       0200,
5075                 .proc_handler   =       ipv6_sysctl_rtcache_flush
5076         },
5077         {
5078                 .procname       =       "gc_thresh",
5079                 .data           =       &ip6_dst_ops_template.gc_thresh,
5080                 .maxlen         =       sizeof(int),
5081                 .mode           =       0644,
5082                 .proc_handler   =       proc_dointvec,
5083         },
5084         {
5085                 .procname       =       "max_size",
5086                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
5087                 .maxlen         =       sizeof(int),
5088                 .mode           =       0644,
5089                 .proc_handler   =       proc_dointvec,
5090         },
5091         {
5092                 .procname       =       "gc_min_interval",
5093                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5094                 .maxlen         =       sizeof(int),
5095                 .mode           =       0644,
5096                 .proc_handler   =       proc_dointvec_jiffies,
5097         },
5098         {
5099                 .procname       =       "gc_timeout",
5100                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5101                 .maxlen         =       sizeof(int),
5102                 .mode           =       0644,
5103                 .proc_handler   =       proc_dointvec_jiffies,
5104         },
5105         {
5106                 .procname       =       "gc_interval",
5107                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5108                 .maxlen         =       sizeof(int),
5109                 .mode           =       0644,
5110                 .proc_handler   =       proc_dointvec_jiffies,
5111         },
5112         {
5113                 .procname       =       "gc_elasticity",
5114                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5115                 .maxlen         =       sizeof(int),
5116                 .mode           =       0644,
5117                 .proc_handler   =       proc_dointvec,
5118         },
5119         {
5120                 .procname       =       "mtu_expires",
5121                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5122                 .maxlen         =       sizeof(int),
5123                 .mode           =       0644,
5124                 .proc_handler   =       proc_dointvec_jiffies,
5125         },
5126         {
5127                 .procname       =       "min_adv_mss",
5128                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5129                 .maxlen         =       sizeof(int),
5130                 .mode           =       0644,
5131                 .proc_handler   =       proc_dointvec,
5132         },
5133         {
5134                 .procname       =       "gc_min_interval_ms",
5135                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5136                 .maxlen         =       sizeof(int),
5137                 .mode           =       0644,
5138                 .proc_handler   =       proc_dointvec_ms_jiffies,
5139         },
5140         { }
5141 };
5142
5143 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5144 {
5145         struct ctl_table *table;
5146
5147         table = kmemdup(ipv6_route_table_template,
5148                         sizeof(ipv6_route_table_template),
5149                         GFP_KERNEL);
5150
5151         if (table) {
5152                 table[0].data = &net->ipv6.sysctl.flush_delay;
5153                 table[0].extra1 = net;
5154                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5155                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5156                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5157                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5158                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5159                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5160                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5161                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5162                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5163
5164                 /* Don't export sysctls to unprivileged users */
5165                 if (net->user_ns != &init_user_ns)
5166                         table[0].procname = NULL;
5167         }
5168
5169         return table;
5170 }
5171 #endif
5172
5173 static int __net_init ip6_route_net_init(struct net *net)
5174 {
5175         int ret = -ENOMEM;
5176
5177         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5178                sizeof(net->ipv6.ip6_dst_ops));
5179
5180         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5181                 goto out_ip6_dst_ops;
5182
5183         net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5184                                             sizeof(*net->ipv6.fib6_null_entry),
5185                                             GFP_KERNEL);
5186         if (!net->ipv6.fib6_null_entry)
5187                 goto out_ip6_dst_entries;
5188
5189         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5190                                            sizeof(*net->ipv6.ip6_null_entry),
5191                                            GFP_KERNEL);
5192         if (!net->ipv6.ip6_null_entry)
5193                 goto out_fib6_null_entry;
5194         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5195         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5196                          ip6_template_metrics, true);
5197
5198 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5199         net->ipv6.fib6_has_custom_rules = false;
5200         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5201                                                sizeof(*net->ipv6.ip6_prohibit_entry),
5202                                                GFP_KERNEL);
5203         if (!net->ipv6.ip6_prohibit_entry)
5204                 goto out_ip6_null_entry;
5205         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5206         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5207                          ip6_template_metrics, true);
5208
5209         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5210                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
5211                                                GFP_KERNEL);
5212         if (!net->ipv6.ip6_blk_hole_entry)
5213                 goto out_ip6_prohibit_entry;
5214         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5215         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5216                          ip6_template_metrics, true);
5217 #endif
5218
5219         net->ipv6.sysctl.flush_delay = 0;
5220         net->ipv6.sysctl.ip6_rt_max_size = 4096;
5221         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5222         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5223         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5224         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5225         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5226         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5227
5228         net->ipv6.ip6_rt_gc_expire = 30*HZ;
5229
5230         ret = 0;
5231 out:
5232         return ret;
5233
5234 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5235 out_ip6_prohibit_entry:
5236         kfree(net->ipv6.ip6_prohibit_entry);
5237 out_ip6_null_entry:
5238         kfree(net->ipv6.ip6_null_entry);
5239 #endif
5240 out_fib6_null_entry:
5241         kfree(net->ipv6.fib6_null_entry);
5242 out_ip6_dst_entries:
5243         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5244 out_ip6_dst_ops:
5245         goto out;
5246 }
5247
5248 static void __net_exit ip6_route_net_exit(struct net *net)
5249 {
5250         kfree(net->ipv6.fib6_null_entry);
5251         kfree(net->ipv6.ip6_null_entry);
5252 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5253         kfree(net->ipv6.ip6_prohibit_entry);
5254         kfree(net->ipv6.ip6_blk_hole_entry);
5255 #endif
5256         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5257 }
5258
5259 static int __net_init ip6_route_net_init_late(struct net *net)
5260 {
5261 #ifdef CONFIG_PROC_FS
5262         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5263                         sizeof(struct ipv6_route_iter));
5264         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5265                         rt6_stats_seq_show, NULL);
5266 #endif
5267         return 0;
5268 }
5269
5270 static void __net_exit ip6_route_net_exit_late(struct net *net)
5271 {
5272 #ifdef CONFIG_PROC_FS
5273         remove_proc_entry("ipv6_route", net->proc_net);
5274         remove_proc_entry("rt6_stats", net->proc_net);
5275 #endif
5276 }
5277
5278 static struct pernet_operations ip6_route_net_ops = {
5279         .init = ip6_route_net_init,
5280         .exit = ip6_route_net_exit,
5281 };
5282
5283 static int __net_init ipv6_inetpeer_init(struct net *net)
5284 {
5285         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5286
5287         if (!bp)
5288                 return -ENOMEM;
5289         inet_peer_base_init(bp);
5290         net->ipv6.peers = bp;
5291         return 0;
5292 }
5293
5294 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5295 {
5296         struct inet_peer_base *bp = net->ipv6.peers;
5297
5298         net->ipv6.peers = NULL;
5299         inetpeer_invalidate_tree(bp);
5300         kfree(bp);
5301 }
5302
5303 static struct pernet_operations ipv6_inetpeer_ops = {
5304         .init   =       ipv6_inetpeer_init,
5305         .exit   =       ipv6_inetpeer_exit,
5306 };
5307
5308 static struct pernet_operations ip6_route_net_late_ops = {
5309         .init = ip6_route_net_init_late,
5310         .exit = ip6_route_net_exit_late,
5311 };
5312
5313 static struct notifier_block ip6_route_dev_notifier = {
5314         .notifier_call = ip6_route_dev_notify,
5315         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5316 };
5317
5318 void __init ip6_route_init_special_entries(void)
5319 {
5320         /* Registering of the loopback is done before this portion of code,
5321          * the loopback reference in rt6_info will not be taken, do it
5322          * manually for init_net */
5323         init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5324         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5325         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5326   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5327         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5328         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5329         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5330         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5331   #endif
5332 }
5333
5334 int __init ip6_route_init(void)
5335 {
5336         int ret;
5337         int cpu;
5338
5339         ret = -ENOMEM;
5340         ip6_dst_ops_template.kmem_cachep =
5341                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5342                                   SLAB_HWCACHE_ALIGN, NULL);
5343         if (!ip6_dst_ops_template.kmem_cachep)
5344                 goto out;
5345
5346         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5347         if (ret)
5348                 goto out_kmem_cache;
5349
5350         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5351         if (ret)
5352                 goto out_dst_entries;
5353
5354         ret = register_pernet_subsys(&ip6_route_net_ops);
5355         if (ret)
5356                 goto out_register_inetpeer;
5357
5358         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5359
5360         ret = fib6_init();
5361         if (ret)
5362                 goto out_register_subsys;
5363
5364         ret = xfrm6_init();
5365         if (ret)
5366                 goto out_fib6_init;
5367
5368         ret = fib6_rules_init();
5369         if (ret)
5370                 goto xfrm6_init;
5371
5372         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5373         if (ret)
5374                 goto fib6_rules_init;
5375
5376         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5377                                    inet6_rtm_newroute, NULL, 0);
5378         if (ret < 0)
5379                 goto out_register_late_subsys;
5380
5381         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5382                                    inet6_rtm_delroute, NULL, 0);
5383         if (ret < 0)
5384                 goto out_register_late_subsys;
5385
5386         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5387                                    inet6_rtm_getroute, NULL,
5388                                    RTNL_FLAG_DOIT_UNLOCKED);
5389         if (ret < 0)
5390                 goto out_register_late_subsys;
5391
5392         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5393         if (ret)
5394                 goto out_register_late_subsys;
5395
5396         for_each_possible_cpu(cpu) {
5397                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5398
5399                 INIT_LIST_HEAD(&ul->head);
5400                 spin_lock_init(&ul->lock);
5401         }
5402
5403 out:
5404         return ret;
5405
5406 out_register_late_subsys:
5407         rtnl_unregister_all(PF_INET6);
5408         unregister_pernet_subsys(&ip6_route_net_late_ops);
5409 fib6_rules_init:
5410         fib6_rules_cleanup();
5411 xfrm6_init:
5412         xfrm6_fini();
5413 out_fib6_init:
5414         fib6_gc_cleanup();
5415 out_register_subsys:
5416         unregister_pernet_subsys(&ip6_route_net_ops);
5417 out_register_inetpeer:
5418         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5419 out_dst_entries:
5420         dst_entries_destroy(&ip6_dst_blackhole_ops);
5421 out_kmem_cache:
5422         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5423         goto out;
5424 }
5425
5426 void ip6_route_cleanup(void)
5427 {
5428         unregister_netdevice_notifier(&ip6_route_dev_notifier);
5429         unregister_pernet_subsys(&ip6_route_net_late_ops);
5430         fib6_rules_cleanup();
5431         xfrm6_fini();
5432         fib6_gc_cleanup();
5433         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5434         unregister_pernet_subsys(&ip6_route_net_ops);
5435         dst_entries_destroy(&ip6_dst_blackhole_ops);
5436         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5437 }