net/ipv6/route.c (platform/kernel/linux-starfive.git, commit 86fb2411e2bda7d148a6467411b4d025bd4c397c)
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 enum rt6_nud_state {
74         RT6_NUD_FAIL_HARD = -3,
75         RT6_NUD_FAIL_PROBE = -2,
76         RT6_NUD_FAIL_DO_RR = -1,
77         RT6_NUD_SUCCEED = 1
78 };
79
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101 static size_t rt6_nlmsg_size(struct rt6_info *rt);
102 static int rt6_fill_node(struct net *net,
103                          struct sk_buff *skb, struct rt6_info *rt,
104                          struct in6_addr *dst, struct in6_addr *src,
105                          int iif, int type, u32 portid, u32 seq,
106                          unsigned int flags);
107
108 #ifdef CONFIG_IPV6_ROUTE_INFO
109 static struct rt6_info *rt6_add_route_info(struct net *net,
110                                            const struct in6_addr *prefix, int prefixlen,
111                                            const struct in6_addr *gwaddr,
112                                            struct net_device *dev,
113                                            unsigned int pref);
114 static struct rt6_info *rt6_get_route_info(struct net *net,
115                                            const struct in6_addr *prefix, int prefixlen,
116                                            const struct in6_addr *gwaddr,
117                                            struct net_device *dev);
118 #endif
119
120 struct uncached_list {
121         spinlock_t              lock;
122         struct list_head        head;
123 };
124
125 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
126
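/* Routes that are not owned by the fib6 tree (e.g. RTF_CACHE clones created
 * for FLOWI_FLAG_KNOWN_NH lookups) are tracked on a per-cpu "uncached" list
 * so they can still be found and re-homed when their device disappears.
 */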
127 static void rt6_uncached_list_add(struct rt6_info *rt)
128 {
129         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
130
131         rt->rt6i_uncached_list = ul;
132
133         spin_lock_bh(&ul->lock);
134         list_add_tail(&rt->rt6i_uncached, &ul->head);
135         spin_unlock_bh(&ul->lock);
136 }
137
138 static void rt6_uncached_list_del(struct rt6_info *rt)
139 {
140         if (!list_empty(&rt->rt6i_uncached)) {
141                 struct uncached_list *ul = rt->rt6i_uncached_list;
142
143                 spin_lock_bh(&ul->lock);
144                 list_del(&rt->rt6i_uncached);
145                 spin_unlock_bh(&ul->lock);
146         }
147 }
148
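/* When @dev is going away, walk every cpu's uncached list and re-point any
 * route still using it (idev and/or dst.dev) at the loopback device so the
 * device's reference counts can drop to zero.
 */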
149 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
150 {
151         struct net_device *loopback_dev = net->loopback_dev;
152         int cpu;
153
154         if (dev == loopback_dev)
155                 return;
156
157         for_each_possible_cpu(cpu) {
158                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
159                 struct rt6_info *rt;
160
161                 spin_lock_bh(&ul->lock);
162                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
163                         struct inet6_dev *rt_idev = rt->rt6i_idev;
164                         struct net_device *rt_dev = rt->dst.dev;
165
166                         if (rt_idev->dev == dev) {
167                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
168                                 in6_dev_put(rt_idev);
169                         }
170
171                         if (rt_dev == dev) {
172                                 rt->dst.dev = loopback_dev;
173                                 dev_hold(rt->dst.dev);
174                                 dev_put(rt_dev);
175                         }
176                 }
177                 spin_unlock_bh(&ul->lock);
178         }
179 }
180
181 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
182 {
183         return dst_metrics_write_ptr(rt->dst.from);
184 }
185
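/* Copy-on-write metrics: per-cpu clones (RTF_PCPU) write through to their
 * parent route's metrics via dst.from, RTF_CACHE clones never get writable
 * metrics, and everything else falls back to the generic COW helper.
 */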
186 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
187 {
188         struct rt6_info *rt = (struct rt6_info *)dst;
189
190         if (rt->rt6i_flags & RTF_PCPU)
191                 return rt6_pcpu_cow_metrics(rt);
192         else if (rt->rt6i_flags & RTF_CACHE)
193                 return NULL;
194         else
195                 return dst_cow_metrics_generic(dst, old);
196 }
197
198 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
199                                              struct sk_buff *skb,
200                                              const void *daddr)
201 {
202         struct in6_addr *p = &rt->rt6i_gateway;
203
204         if (!ipv6_addr_any(p))
205                 return (const void *) p;
206         else if (skb)
207                 return &ipv6_hdr(skb)->daddr;
208         return daddr;
209 }
210
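/* Pick the address to resolve (the gateway if set, otherwise the packet's or
 * the caller's destination) and look up, or create, the ndisc neighbour entry.
 */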
211 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
212                                           struct sk_buff *skb,
213                                           const void *daddr)
214 {
215         struct rt6_info *rt = (struct rt6_info *) dst;
216         struct neighbour *n;
217
218         daddr = choose_neigh_daddr(rt, skb, daddr);
219         n = __ipv6_neigh_lookup(dst->dev, daddr);
220         if (n)
221                 return n;
222         return neigh_create(&nd_tbl, daddr, dst->dev);
223 }
224
225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
226 {
227         struct net_device *dev = dst->dev;
228         struct rt6_info *rt = (struct rt6_info *)dst;
229
230         daddr = choose_neigh_daddr(rt, NULL, daddr);
231         if (!daddr)
232                 return;
233         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
234                 return;
235         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
236                 return;
237         __ipv6_confirm_neigh(dev, daddr);
238 }
239
240 static struct dst_ops ip6_dst_ops_template = {
241         .family                 =       AF_INET6,
242         .gc                     =       ip6_dst_gc,
243         .gc_thresh              =       1024,
244         .check                  =       ip6_dst_check,
245         .default_advmss         =       ip6_default_advmss,
246         .mtu                    =       ip6_mtu,
247         .cow_metrics            =       ipv6_cow_metrics,
248         .destroy                =       ip6_dst_destroy,
249         .ifdown                 =       ip6_dst_ifdown,
250         .negative_advice        =       ip6_negative_advice,
251         .link_failure           =       ip6_link_failure,
252         .update_pmtu            =       ip6_rt_update_pmtu,
253         .redirect               =       rt6_do_redirect,
254         .local_out              =       __ip6_local_out,
255         .neigh_lookup           =       ip6_neigh_lookup,
256         .confirm_neigh          =       ip6_confirm_neigh,
257 };
258
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
260 {
261         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
262
263         return mtu ? : dst->dev->mtu;
264 }
265
266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
267                                          struct sk_buff *skb, u32 mtu)
268 {
269 }
270
271 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
272                                       struct sk_buff *skb)
273 {
274 }
275
276 static struct dst_ops ip6_dst_blackhole_ops = {
277         .family                 =       AF_INET6,
278         .destroy                =       ip6_dst_destroy,
279         .check                  =       ip6_dst_check,
280         .mtu                    =       ip6_blackhole_mtu,
281         .default_advmss         =       ip6_default_advmss,
282         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
283         .redirect               =       ip6_rt_blackhole_redirect,
284         .cow_metrics            =       dst_cow_metrics_generic,
285         .neigh_lookup           =       ip6_neigh_lookup,
286 };
287
288 static const u32 ip6_template_metrics[RTAX_MAX] = {
289         [RTAX_HOPLIMIT - 1] = 0,
290 };
291
292 static const struct rt6_info ip6_null_entry_template = {
293         .dst = {
294                 .__refcnt       = ATOMIC_INIT(1),
295                 .__use          = 1,
296                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
297                 .error          = -ENETUNREACH,
298                 .input          = ip6_pkt_discard,
299                 .output         = ip6_pkt_discard_out,
300         },
301         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
302         .rt6i_protocol  = RTPROT_KERNEL,
303         .rt6i_metric    = ~(u32) 0,
304         .rt6i_ref       = ATOMIC_INIT(1),
305 };
306
307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
308
309 static const struct rt6_info ip6_prohibit_entry_template = {
310         .dst = {
311                 .__refcnt       = ATOMIC_INIT(1),
312                 .__use          = 1,
313                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
314                 .error          = -EACCES,
315                 .input          = ip6_pkt_prohibit,
316                 .output         = ip6_pkt_prohibit_out,
317         },
318         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
319         .rt6i_protocol  = RTPROT_KERNEL,
320         .rt6i_metric    = ~(u32) 0,
321         .rt6i_ref       = ATOMIC_INIT(1),
322 };
323
324 static const struct rt6_info ip6_blk_hole_entry_template = {
325         .dst = {
326                 .__refcnt       = ATOMIC_INIT(1),
327                 .__use          = 1,
328                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
329                 .error          = -EINVAL,
330                 .input          = dst_discard,
331                 .output         = dst_discard_out,
332         },
333         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
334         .rt6i_protocol  = RTPROT_KERNEL,
335         .rt6i_metric    = ~(u32) 0,
336         .rt6i_ref       = ATOMIC_INIT(1),
337 };
338
339 #endif
340
341 static void rt6_info_init(struct rt6_info *rt)
342 {
343         struct dst_entry *dst = &rt->dst;
344
345         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
346         INIT_LIST_HEAD(&rt->rt6i_siblings);
347         INIT_LIST_HEAD(&rt->rt6i_uncached);
348 }
349
350 /* allocate dst with ip6_dst_ops */
351 static struct rt6_info *__ip6_dst_alloc(struct net *net,
352                                         struct net_device *dev,
353                                         int flags)
354 {
355         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
356                                         1, DST_OBSOLETE_FORCE_CHK, flags);
357
358         if (rt)
359                 rt6_info_init(rt);
360
361         return rt;
362 }
363
364 struct rt6_info *ip6_dst_alloc(struct net *net,
365                                struct net_device *dev,
366                                int flags)
367 {
368         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
369
370         if (rt) {
371                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
372                 if (rt->rt6i_pcpu) {
373                         int cpu;
374
375                         for_each_possible_cpu(cpu) {
376                                 struct rt6_info **p;
377
378                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
379                                 /* no one shares rt */
380                                 *p =  NULL;
381                         }
382                 } else {
383                         dst_release_immediate(&rt->dst);
384                         return NULL;
385                 }
386         }
387
388         return rt;
389 }
390 EXPORT_SYMBOL(ip6_dst_alloc);
391
392 static void ip6_dst_destroy(struct dst_entry *dst)
393 {
394         struct rt6_info *rt = (struct rt6_info *)dst;
395         struct dst_entry *from = dst->from;
396         struct inet6_dev *idev;
397
398         dst_destroy_metrics_generic(dst);
399         free_percpu(rt->rt6i_pcpu);
400         rt6_uncached_list_del(rt);
401
402         idev = rt->rt6i_idev;
403         if (idev) {
404                 rt->rt6i_idev = NULL;
405                 in6_dev_put(idev);
406         }
407
408         dst->from = NULL;
409         dst_release(from);
410 }
411
412 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
413                            int how)
414 {
415         struct rt6_info *rt = (struct rt6_info *)dst;
416         struct inet6_dev *idev = rt->rt6i_idev;
417         struct net_device *loopback_dev =
418                 dev_net(dev)->loopback_dev;
419
420         if (idev && idev->dev != loopback_dev) {
421                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
422                 if (loopback_idev) {
423                         rt->rt6i_idev = loopback_idev;
424                         in6_dev_put(idev);
425                 }
426         }
427 }
428
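/* __rt6_check_expired() only looks at this route's own expiry;
 * rt6_check_expired() also follows dst.from so a cached clone is considered
 * expired once its parent route has expired.
 */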
429 static bool __rt6_check_expired(const struct rt6_info *rt)
430 {
431         if (rt->rt6i_flags & RTF_EXPIRES)
432                 return time_after(jiffies, rt->dst.expires);
433         else
434                 return false;
435 }
436
437 static bool rt6_check_expired(const struct rt6_info *rt)
438 {
439         if (rt->rt6i_flags & RTF_EXPIRES) {
440                 if (time_after(jiffies, rt->dst.expires))
441                         return true;
442         } else if (rt->dst.from) {
443                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
444         }
445         return false;
446 }
447
448 /* Multipath route selection:
449  *   Hash-based function using the packet header and flow label.
450  * Adapted from fib_info_hashfn()
451  */
452 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
453                                const struct flowi6 *fl6)
454 {
455         return get_hash_from_flowi6(fl6) % candidate_count;
456 }
457
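/* Pick one of an ECMP route's siblings based on the flow hash above; a hash
 * of 0 keeps the first route, and a sibling that scores negatively is skipped
 * in favour of the original match.
 */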
458 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
459                                              struct flowi6 *fl6, int oif,
460                                              int strict)
461 {
462         struct rt6_info *sibling, *next_sibling;
463         int route_choosen;
464
465         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
466         /* Don't change the route if route_choosen == 0
467          * (the siblings list does not include ourself)
468          */
469         if (route_choosen)
470                 list_for_each_entry_safe(sibling, next_sibling,
471                                 &match->rt6i_siblings, rt6i_siblings) {
472                         route_choosen--;
473                         if (route_choosen == 0) {
474                                 if (rt6_score_route(sibling, oif, strict) < 0)
475                                         break;
476                                 match = sibling;
477                                 break;
478                         }
479                 }
480         return match;
481 }
482
483 /*
484  *      Route lookup. Any table->tb6_lock is implied.
485  */
486
487 static inline struct rt6_info *rt6_device_match(struct net *net,
488                                                     struct rt6_info *rt,
489                                                     const struct in6_addr *saddr,
490                                                     int oif,
491                                                     int flags)
492 {
493         struct rt6_info *local = NULL;
494         struct rt6_info *sprt;
495
496         if (!oif && ipv6_addr_any(saddr))
497                 goto out;
498
499         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
500                 struct net_device *dev = sprt->dst.dev;
501
502                 if (oif) {
503                         if (dev->ifindex == oif)
504                                 return sprt;
505                         if (dev->flags & IFF_LOOPBACK) {
506                                 if (!sprt->rt6i_idev ||
507                                     sprt->rt6i_idev->dev->ifindex != oif) {
508                                         if (flags & RT6_LOOKUP_F_IFACE)
509                                                 continue;
510                                         if (local &&
511                                             local->rt6i_idev->dev->ifindex == oif)
512                                                 continue;
513                                 }
514                                 local = sprt;
515                         }
516                 } else {
517                         if (ipv6_chk_addr(net, saddr, dev,
518                                           flags & RT6_LOOKUP_F_IFACE))
519                                 return sprt;
520                 }
521         }
522
523         if (oif) {
524                 if (local)
525                         return local;
526
527                 if (flags & RT6_LOOKUP_F_IFACE)
528                         return net->ipv6.ip6_null_entry;
529         }
530 out:
531         return rt;
532 }
533
534 #ifdef CONFIG_IPV6_ROUTER_PREF
535 struct __rt6_probe_work {
536         struct work_struct work;
537         struct in6_addr target;
538         struct net_device *dev;
539 };
540
541 static void rt6_probe_deferred(struct work_struct *w)
542 {
543         struct in6_addr mcaddr;
544         struct __rt6_probe_work *work =
545                 container_of(w, struct __rt6_probe_work, work);
546
547         addrconf_addr_solict_mult(&work->target, &mcaddr);
548         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
549         dev_put(work->dev);
550         kfree(work);
551 }
552
553 static void rt6_probe(struct rt6_info *rt)
554 {
555         struct __rt6_probe_work *work;
556         struct neighbour *neigh;
557         /*
558          * Okay, this does not seem to be appropriate
559          * for now; however, we need to check whether it
560          * really is so, aka Router Reachability Probing.
561          *
562          * Router Reachability Probe MUST be rate-limited
563          * to no more than one per minute.
564          */
565         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
566                 return;
567         rcu_read_lock_bh();
568         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
569         if (neigh) {
570                 if (neigh->nud_state & NUD_VALID)
571                         goto out;
572
573                 work = NULL;
574                 write_lock(&neigh->lock);
575                 if (!(neigh->nud_state & NUD_VALID) &&
576                     time_after(jiffies,
577                                neigh->updated +
578                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
579                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
580                         if (work)
581                                 __neigh_set_probe_once(neigh);
582                 }
583                 write_unlock(&neigh->lock);
584         } else {
585                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
586         }
587
588         if (work) {
589                 INIT_WORK(&work->work, rt6_probe_deferred);
590                 work->target = rt->rt6i_gateway;
591                 dev_hold(rt->dst.dev);
592                 work->dev = rt->dst.dev;
593                 schedule_work(&work->work);
594         }
595
596 out:
597         rcu_read_unlock_bh();
598 }
599 #else
600 static inline void rt6_probe(struct rt6_info *rt)
601 {
602 }
603 #endif
604
605 /*
606  * Default Router Selection (RFC 2461 6.3.6)
607  */
608 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
609 {
610         struct net_device *dev = rt->dst.dev;
611         if (!oif || dev->ifindex == oif)
612                 return 2;
613         if ((dev->flags & IFF_LOOPBACK) &&
614             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
615                 return 1;
616         return 0;
617 }
618
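/* Classify the next hop's neighbour state for route scoring: no next hop or a
 * NUD_VALID entry succeeds; with router preferences enabled an entry that has
 * not FAILED is still acceptable and a FAILED one requests a probe; a missing
 * entry triggers round-robin unless router preferences are compiled in.
 */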
619 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
620 {
621         struct neighbour *neigh;
622         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
623
624         if (rt->rt6i_flags & RTF_NONEXTHOP ||
625             !(rt->rt6i_flags & RTF_GATEWAY))
626                 return RT6_NUD_SUCCEED;
627
628         rcu_read_lock_bh();
629         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
630         if (neigh) {
631                 read_lock(&neigh->lock);
632                 if (neigh->nud_state & NUD_VALID)
633                         ret = RT6_NUD_SUCCEED;
634 #ifdef CONFIG_IPV6_ROUTER_PREF
635                 else if (!(neigh->nud_state & NUD_FAILED))
636                         ret = RT6_NUD_SUCCEED;
637                 else
638                         ret = RT6_NUD_FAIL_PROBE;
639 #endif
640                 read_unlock(&neigh->lock);
641         } else {
642                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
643                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
644         }
645         rcu_read_unlock_bh();
646
647         return ret;
648 }
649
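/* Score a route for router selection: the low bits encode how well the device
 * matches oif, the decoded router preference is added above them, and in
 * RT6_LOOKUP_F_REACHABLE mode a negative neighbour verdict is returned as-is.
 */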
650 static int rt6_score_route(struct rt6_info *rt, int oif,
651                            int strict)
652 {
653         int m;
654
655         m = rt6_check_dev(rt, oif);
656         if (!m && (strict & RT6_LOOKUP_F_IFACE))
657                 return RT6_NUD_FAIL_HARD;
658 #ifdef CONFIG_IPV6_ROUTER_PREF
659         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
660 #endif
661         if (strict & RT6_LOOKUP_F_REACHABLE) {
662                 int n = rt6_check_neigh(rt);
663                 if (n < 0)
664                         return n;
665         }
666         return m;
667 }
668
669 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
670                                    int *mpri, struct rt6_info *match,
671                                    bool *do_rr)
672 {
673         int m;
674         bool match_do_rr = false;
675         struct inet6_dev *idev = rt->rt6i_idev;
676         struct net_device *dev = rt->dst.dev;
677
678         if (dev && !netif_carrier_ok(dev) &&
679             idev->cnf.ignore_routes_with_linkdown &&
680             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
681                 goto out;
682
683         if (rt6_check_expired(rt))
684                 goto out;
685
686         m = rt6_score_route(rt, oif, strict);
687         if (m == RT6_NUD_FAIL_DO_RR) {
688                 match_do_rr = true;
689                 m = 0; /* lowest valid score */
690         } else if (m == RT6_NUD_FAIL_HARD) {
691                 goto out;
692         }
693
694         if (strict & RT6_LOOKUP_F_REACHABLE)
695                 rt6_probe(rt);
696
697         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
698         if (m > *mpri) {
699                 *do_rr = match_do_rr;
700                 *mpri = m;
701                 match = rt;
702         }
703 out:
704         return match;
705 }
706
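/* Scan the routes of this fib6 node that share @metric, starting at @rr_head
 * and wrapping around to fn->leaf, and return the best-scoring one; routes
 * with a different metric ("cont") are only tried if nothing else matched.
 */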
707 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
708                                      struct rt6_info *rr_head,
709                                      u32 metric, int oif, int strict,
710                                      bool *do_rr)
711 {
712         struct rt6_info *rt, *match, *cont;
713         int mpri = -1;
714
715         match = NULL;
716         cont = NULL;
717         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
718                 if (rt->rt6i_metric != metric) {
719                         cont = rt;
720                         break;
721                 }
722
723                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
724         }
725
726         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
727                 if (rt->rt6i_metric != metric) {
728                         cont = rt;
729                         break;
730                 }
731
732                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
733         }
734
735         if (match || !cont)
736                 return match;
737
738         for (rt = cont; rt; rt = rt->dst.rt6_next)
739                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
740
741         return match;
742 }
743
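/* Round-robin default router selection: score the same-metric routes starting
 * at fn->rr_ptr and, when find_rr_leaf() requests it, advance rr_ptr so the
 * next lookup starts from the following route of the same metric.
 */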
744 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
745 {
746         struct rt6_info *match, *rt0;
747         struct net *net;
748         bool do_rr = false;
749
750         rt0 = fn->rr_ptr;
751         if (!rt0)
752                 fn->rr_ptr = rt0 = fn->leaf;
753
754         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
755                              &do_rr);
756
757         if (do_rr) {
758                 struct rt6_info *next = rt0->dst.rt6_next;
759
760                 /* no entries matched; do round-robin */
761                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
762                         next = fn->leaf;
763
764                 if (next != rt0)
765                         fn->rr_ptr = next;
766         }
767
768         net = dev_net(rt0->dst.dev);
769         return match ? match : net->ipv6.ip6_null_entry;
770 }
771
772 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
773 {
774         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
775 }
776
777 #ifdef CONFIG_IPV6_ROUTE_INFO
778 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
779                   const struct in6_addr *gwaddr)
780 {
781         struct net *net = dev_net(dev);
782         struct route_info *rinfo = (struct route_info *) opt;
783         struct in6_addr prefix_buf, *prefix;
784         unsigned int pref;
785         unsigned long lifetime;
786         struct rt6_info *rt;
787
788         if (len < sizeof(struct route_info)) {
789                 return -EINVAL;
790         }
791
792         /* Sanity check for prefix_len and length */
793         if (rinfo->length > 3) {
794                 return -EINVAL;
795         } else if (rinfo->prefix_len > 128) {
796                 return -EINVAL;
797         } else if (rinfo->prefix_len > 64) {
798                 if (rinfo->length < 2) {
799                         return -EINVAL;
800                 }
801         } else if (rinfo->prefix_len > 0) {
802                 if (rinfo->length < 1) {
803                         return -EINVAL;
804                 }
805         }
806
807         pref = rinfo->route_pref;
808         if (pref == ICMPV6_ROUTER_PREF_INVALID)
809                 return -EINVAL;
810
811         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
812
813         if (rinfo->length == 3)
814                 prefix = (struct in6_addr *)rinfo->prefix;
815         else {
816                 /* this function is safe */
817                 ipv6_addr_prefix(&prefix_buf,
818                                  (struct in6_addr *)rinfo->prefix,
819                                  rinfo->prefix_len);
820                 prefix = &prefix_buf;
821         }
822
823         if (rinfo->prefix_len == 0)
824                 rt = rt6_get_dflt_router(gwaddr, dev);
825         else
826                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
827                                         gwaddr, dev);
828
829         if (rt && !lifetime) {
830                 ip6_del_rt(rt);
831                 rt = NULL;
832         }
833
834         if (!rt && lifetime)
835                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
836                                         dev, pref);
837         else if (rt)
838                 rt->rt6i_flags = RTF_ROUTEINFO |
839                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
840
841         if (rt) {
842                 if (!addrconf_finite_timeout(lifetime))
843                         rt6_clean_expires(rt);
844                 else
845                         rt6_set_expires(rt, jiffies + HZ * lifetime);
846
847                 ip6_rt_put(rt);
848         }
849         return 0;
850 }
851 #endif
852
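/* Walk back up the fib6 tree, descending into a parent's source subtree when
 * one exists, until a node carrying route info (RTN_RTINFO) is found; returns
 * NULL once the tree root is reached.
 */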
853 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
854                                         struct in6_addr *saddr)
855 {
856         struct fib6_node *pn;
857         while (1) {
858                 if (fn->fn_flags & RTN_TL_ROOT)
859                         return NULL;
860                 pn = fn->parent;
861                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
862                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
863                 else
864                         fn = pn;
865                 if (fn->fn_flags & RTN_RTINFO)
866                         return fn;
867         }
868 }
869
870 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
871                                              struct fib6_table *table,
872                                              struct flowi6 *fl6, int flags)
873 {
874         struct fib6_node *fn;
875         struct rt6_info *rt;
876
877         read_lock_bh(&table->tb6_lock);
878         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
879 restart:
880         rt = fn->leaf;
881         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
882         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
883                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
884         if (rt == net->ipv6.ip6_null_entry) {
885                 fn = fib6_backtrack(fn, &fl6->saddr);
886                 if (fn)
887                         goto restart;
888         }
889         dst_use(&rt->dst, jiffies);
890         read_unlock_bh(&table->tb6_lock);
891
892         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
893
894         return rt;
895
896 }
897
898 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
899                                     int flags)
900 {
901         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
902 }
903 EXPORT_SYMBOL_GPL(ip6_route_lookup);
904
905 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
906                             const struct in6_addr *saddr, int oif, int strict)
907 {
908         struct flowi6 fl6 = {
909                 .flowi6_oif = oif,
910                 .daddr = *daddr,
911         };
912         struct dst_entry *dst;
913         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
914
915         if (saddr) {
916                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
917                 flags |= RT6_LOOKUP_F_HAS_SADDR;
918         }
919
920         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
921         if (dst->error == 0)
922                 return (struct rt6_info *) dst;
923
924         dst_release(dst);
925
926         return NULL;
927 }
928 EXPORT_SYMBOL(rt6_lookup);
929
930 /* ip6_ins_rt is called with table->tb6_lock NOT held.
931  * It takes a new route entry; if the addition fails for any
932  * reason, the route is released.
933  * The caller must hold a dst reference before calling it.
934  */
935
936 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
937                         struct mx6_config *mxc,
938                         struct netlink_ext_ack *extack)
939 {
940         int err;
941         struct fib6_table *table;
942
943         table = rt->rt6i_table;
944         write_lock_bh(&table->tb6_lock);
945         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
946         write_unlock_bh(&table->tb6_lock);
947
948         return err;
949 }
950
951 int ip6_ins_rt(struct rt6_info *rt)
952 {
953         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
954         struct mx6_config mxc = { .mx = NULL, };
955
956         /* Hold dst to account for the reference from the fib6 tree */
957         dst_hold(&rt->dst);
958         return __ip6_ins_rt(rt, &info, &mxc, NULL);
959 }
960
961 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
962                                            const struct in6_addr *daddr,
963                                            const struct in6_addr *saddr)
964 {
965         struct rt6_info *rt;
966
967         /*
968          *      Clone the route.
969          */
970
971         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
972                 ort = (struct rt6_info *)ort->dst.from;
973
974         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
975
976         if (!rt)
977                 return NULL;
978
979         ip6_rt_copy_init(rt, ort);
980         rt->rt6i_flags |= RTF_CACHE;
981         rt->rt6i_metric = 0;
982         rt->dst.flags |= DST_HOST;
983         rt->rt6i_dst.addr = *daddr;
984         rt->rt6i_dst.plen = 128;
985
986         if (!rt6_is_gw_or_nonexthop(ort)) {
987                 if (ort->rt6i_dst.plen != 128 &&
988                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
989                         rt->rt6i_flags |= RTF_ANYCAST;
990 #ifdef CONFIG_IPV6_SUBTREES
991                 if (rt->rt6i_src.plen && saddr) {
992                         rt->rt6i_src.addr = *saddr;
993                         rt->rt6i_src.plen = 128;
994                 }
995 #endif
996         }
997
998         return rt;
999 }
1000
1001 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1002 {
1003         struct rt6_info *pcpu_rt;
1004
1005         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
1006                                   rt->dst.dev, rt->dst.flags);
1007
1008         if (!pcpu_rt)
1009                 return NULL;
1010         ip6_rt_copy_init(pcpu_rt, rt);
1011         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1012         pcpu_rt->rt6i_flags |= RTF_PCPU;
1013         return pcpu_rt;
1014 }
1015
1016 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1017 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1018 {
1019         struct rt6_info *pcpu_rt, **p;
1020
1021         p = this_cpu_ptr(rt->rt6i_pcpu);
1022         pcpu_rt = *p;
1023
1024         if (pcpu_rt) {
1025                 dst_hold(&pcpu_rt->dst);
1026                 rt6_dst_from_metrics_check(pcpu_rt);
1027         }
1028         return pcpu_rt;
1029 }
1030
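/* Create and install a per-cpu copy of @rt with cmpxchg().  If another cpu
 * raced us, or @rt was removed from the tree while the lock was dropped, the
 * fresh copy is released and the existing (or original) route is returned
 * with a reference held.
 */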
1031 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1032 {
1033         struct fib6_table *table = rt->rt6i_table;
1034         struct rt6_info *pcpu_rt, *prev, **p;
1035
1036         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1037         if (!pcpu_rt) {
1038                 struct net *net = dev_net(rt->dst.dev);
1039
1040                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1041                 return net->ipv6.ip6_null_entry;
1042         }
1043
1044         read_lock_bh(&table->tb6_lock);
1045         if (rt->rt6i_pcpu) {
1046                 p = this_cpu_ptr(rt->rt6i_pcpu);
1047                 prev = cmpxchg(p, NULL, pcpu_rt);
1048                 if (prev) {
1049                         /* If someone did it before us, return prev instead */
1050                         dst_release_immediate(&pcpu_rt->dst);
1051                         pcpu_rt = prev;
1052                 }
1053         } else {
1054                 /* rt has been removed from the fib6 tree
1055                  * before we have a chance to acquire the read_lock.
1056                  * In this case, don't bother to create a pcpu rt
1057                  * since rt is going away anyway.  The next
1058                  * dst_check() will trigger a re-lookup.
1059                  */
1060                 dst_release_immediate(&pcpu_rt->dst);
1061                 pcpu_rt = rt;
1062         }
1063         dst_hold(&pcpu_rt->dst);
1064         rt6_dst_from_metrics_check(pcpu_rt);
1065         read_unlock_bh(&table->tb6_lock);
1066         return pcpu_rt;
1067 }
1068
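/* Core policy-routing lookup: select a route from @table (backtracking and,
 * if forwarding is off, retrying without the reachability requirement), then
 * return either the fib entry itself (null entry or RTF_CACHE), a one-off
 * uncached clone for FLOWI_FLAG_KNOWN_NH, or a per-cpu copy of the route.
 */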
1069 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1070                                int oif, struct flowi6 *fl6, int flags)
1071 {
1072         struct fib6_node *fn, *saved_fn;
1073         struct rt6_info *rt;
1074         int strict = 0;
1075
1076         strict |= flags & RT6_LOOKUP_F_IFACE;
1077         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1078         if (net->ipv6.devconf_all->forwarding == 0)
1079                 strict |= RT6_LOOKUP_F_REACHABLE;
1080
1081         read_lock_bh(&table->tb6_lock);
1082
1083         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1084         saved_fn = fn;
1085
1086         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1087                 oif = 0;
1088
1089 redo_rt6_select:
1090         rt = rt6_select(fn, oif, strict);
1091         if (rt->rt6i_nsiblings)
1092                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1093         if (rt == net->ipv6.ip6_null_entry) {
1094                 fn = fib6_backtrack(fn, &fl6->saddr);
1095                 if (fn)
1096                         goto redo_rt6_select;
1097                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1098                         /* also consider unreachable route */
1099                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1100                         fn = saved_fn;
1101                         goto redo_rt6_select;
1102                 }
1103         }
1104
1105
1106         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1107                 dst_use(&rt->dst, jiffies);
1108                 read_unlock_bh(&table->tb6_lock);
1109
1110                 rt6_dst_from_metrics_check(rt);
1111
1112                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1113                 return rt;
1114         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1115                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1116                 /* Create a RTF_CACHE clone which will not be
1117                  * owned by the fib6 tree.  It is for the special case where
1118                  * the daddr in the skb during the neighbor look-up is different
1119                  * from the fl6->daddr used to look up the route here.
1120                  */
1121
1122                 struct rt6_info *uncached_rt;
1123
1124                 dst_use(&rt->dst, jiffies);
1125                 read_unlock_bh(&table->tb6_lock);
1126
1127                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1128                 dst_release(&rt->dst);
1129
1130                 if (uncached_rt) {
1131                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc();
1132                          * no need for another dst_hold().
1133                          */
1134                         rt6_uncached_list_add(uncached_rt);
1135                 } else {
1136                         uncached_rt = net->ipv6.ip6_null_entry;
1137                         dst_hold(&uncached_rt->dst);
1138                 }
1139
1140                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1141                 return uncached_rt;
1142
1143         } else {
1144                 /* Get a percpu copy */
1145
1146                 struct rt6_info *pcpu_rt;
1147
1148                 rt->dst.lastuse = jiffies;
1149                 rt->dst.__use++;
1150                 pcpu_rt = rt6_get_pcpu_route(rt);
1151
1152                 if (pcpu_rt) {
1153                         read_unlock_bh(&table->tb6_lock);
1154                 } else {
1155                         /* We have to do the read_unlock first
1156                          * because rt6_make_pcpu_route() may trigger
1157                          * ip6_dst_gc() which will take the write_lock.
1158                          */
1159                         dst_hold(&rt->dst);
1160                         read_unlock_bh(&table->tb6_lock);
1161                         pcpu_rt = rt6_make_pcpu_route(rt);
1162                         dst_release(&rt->dst);
1163                 }
1164
1165                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1166                 return pcpu_rt;
1167
1168         }
1169 }
1170 EXPORT_SYMBOL_GPL(ip6_pol_route);
1171
1172 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1173                                             struct flowi6 *fl6, int flags)
1174 {
1175         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1176 }
1177
1178 struct dst_entry *ip6_route_input_lookup(struct net *net,
1179                                          struct net_device *dev,
1180                                          struct flowi6 *fl6, int flags)
1181 {
1182         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1183                 flags |= RT6_LOOKUP_F_IFACE;
1184
1185         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1186 }
1187 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1188
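/* Route an incoming packet: build the flow key from the IPv6 header (plus any
 * received tunnel metadata's tun_id) and attach the looked-up dst to the skb.
 */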
1189 void ip6_route_input(struct sk_buff *skb)
1190 {
1191         const struct ipv6hdr *iph = ipv6_hdr(skb);
1192         struct net *net = dev_net(skb->dev);
1193         int flags = RT6_LOOKUP_F_HAS_SADDR;
1194         struct ip_tunnel_info *tun_info;
1195         struct flowi6 fl6 = {
1196                 .flowi6_iif = skb->dev->ifindex,
1197                 .daddr = iph->daddr,
1198                 .saddr = iph->saddr,
1199                 .flowlabel = ip6_flowinfo(iph),
1200                 .flowi6_mark = skb->mark,
1201                 .flowi6_proto = iph->nexthdr,
1202         };
1203
1204         tun_info = skb_tunnel_info(skb);
1205         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1206                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1207         skb_dst_drop(skb);
1208         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1209 }
1210
1211 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1212                                              struct flowi6 *fl6, int flags)
1213 {
1214         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1215 }
1216
1217 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1218                                          struct flowi6 *fl6, int flags)
1219 {
1220         bool any_src;
1221
1222         if (rt6_need_strict(&fl6->daddr)) {
1223                 struct dst_entry *dst;
1224
1225                 dst = l3mdev_link_scope_lookup(net, fl6);
1226                 if (dst)
1227                         return dst;
1228         }
1229
1230         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1231
1232         any_src = ipv6_addr_any(&fl6->saddr);
1233         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1234             (fl6->flowi6_oif && any_src))
1235                 flags |= RT6_LOOKUP_F_IFACE;
1236
1237         if (!any_src)
1238                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1239         else if (sk)
1240                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1241
1242         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1243 }
1244 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1245
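/* Build a blackhole copy of @dst_orig: it keeps the addressing, gateway and
 * metrics of the original route but discards every packet sent through it.
 * The reference on @dst_orig is consumed.
 */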
1246 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1247 {
1248         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1249         struct net_device *loopback_dev = net->loopback_dev;
1250         struct dst_entry *new = NULL;
1251
1252         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1253                        DST_OBSOLETE_NONE, 0);
1254         if (rt) {
1255                 rt6_info_init(rt);
1256
1257                 new = &rt->dst;
1258                 new->__use = 1;
1259                 new->input = dst_discard;
1260                 new->output = dst_discard_out;
1261
1262                 dst_copy_metrics(new, &ort->dst);
1263
1264                 rt->rt6i_idev = in6_dev_get(loopback_dev);
1265                 rt->rt6i_gateway = ort->rt6i_gateway;
1266                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1267                 rt->rt6i_metric = 0;
1268
1269                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1270 #ifdef CONFIG_IPV6_SUBTREES
1271                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1272 #endif
1273         }
1274
1275         dst_release(dst_orig);
1276         return new ? new : ERR_PTR(-ENOMEM);
1277 }
1278
1279 /*
1280  *      Destination cache support functions
1281  */
1282
1283 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1284 {
1285         if (rt->dst.from &&
1286             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1287                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1288 }
1289
1290 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1291 {
1292         u32 rt_cookie = 0;
1293
1294         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1295                 return NULL;
1296
1297         if (rt6_check_expired(rt))
1298                 return NULL;
1299
1300         return &rt->dst;
1301 }
1302
1303 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1304 {
1305         if (!__rt6_check_expired(rt) &&
1306             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1307             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1308                 return &rt->dst;
1309         else
1310                 return NULL;
1311 }
1312
1313 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1314 {
1315         struct rt6_info *rt;
1316
1317         rt = (struct rt6_info *) dst;
1318
1319         /* All IPV6 dsts are created with ->obsolete set to the value
1320          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1321          * into this function always.
1322          */
1323
1324         rt6_dst_from_metrics_check(rt);
1325
1326         if (rt->rt6i_flags & RTF_PCPU ||
1327             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1328                 return rt6_dst_from_check(rt, cookie);
1329         else
1330                 return rt6_check(rt, cookie);
1331 }
1332
1333 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1334 {
1335         struct rt6_info *rt = (struct rt6_info *) dst;
1336
1337         if (rt) {
1338                 if (rt->rt6i_flags & RTF_CACHE) {
1339                         if (rt6_check_expired(rt)) {
1340                                 ip6_del_rt(rt);
1341                                 dst = NULL;
1342                         }
1343                 } else {
1344                         dst_release(dst);
1345                         dst = NULL;
1346                 }
1347         }
1348         return dst;
1349 }
1350
1351 static void ip6_link_failure(struct sk_buff *skb)
1352 {
1353         struct rt6_info *rt;
1354
1355         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1356
1357         rt = (struct rt6_info *) skb_dst(skb);
1358         if (rt) {
1359                 if (rt->rt6i_flags & RTF_CACHE) {
1360                         if (dst_hold_safe(&rt->dst))
1361                                 ip6_del_rt(rt);
1362                 } else {
1363                         struct fib6_node *fn;
1364
1365                         rcu_read_lock();
1366                         fn = rcu_dereference(rt->rt6i_node);
1367                         if (fn && (rt->rt6i_flags & RTF_DEFAULT))
1368                                 fn->fn_sernum = -1;
1369                         rcu_read_unlock();
1370                 }
1371         }
1372 }
1373
1374 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1375 {
1376         struct net *net = dev_net(rt->dst.dev);
1377
1378         rt->rt6i_flags |= RTF_MODIFIED;
1379         rt->rt6i_pmtu = mtu;
1380         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1381 }
1382
1383 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1384 {
1385         return !(rt->rt6i_flags & RTF_CACHE) &&
1386                 (rt->rt6i_flags & RTF_PCPU ||
1387                  rcu_access_pointer(rt->rt6i_node));
1388 }
1389
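/* Record a learned path MTU.  Routes that are already private copies
 * (RTF_CACHE or off-tree) are updated in place; otherwise a host RTF_CACHE
 * clone carrying the new MTU is created and inserted so the shared fib entry
 * is left untouched.
 */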
1390 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1391                                  const struct ipv6hdr *iph, u32 mtu)
1392 {
1393         const struct in6_addr *daddr, *saddr;
1394         struct rt6_info *rt6 = (struct rt6_info *)dst;
1395
1396         if (rt6->rt6i_flags & RTF_LOCAL)
1397                 return;
1398
1399         if (dst_metric_locked(dst, RTAX_MTU))
1400                 return;
1401
1402         if (iph) {
1403                 daddr = &iph->daddr;
1404                 saddr = &iph->saddr;
1405         } else if (sk) {
1406                 daddr = &sk->sk_v6_daddr;
1407                 saddr = &inet6_sk(sk)->saddr;
1408         } else {
1409                 daddr = NULL;
1410                 saddr = NULL;
1411         }
1412         dst_confirm_neigh(dst, daddr);
1413         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1414         if (mtu >= dst_mtu(dst))
1415                 return;
1416
1417         if (!rt6_cache_allowed_for_pmtu(rt6)) {
1418                 rt6_do_update_pmtu(rt6, mtu);
1419         } else if (daddr) {
1420                 struct rt6_info *nrt6;
1421
1422                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1423                 if (nrt6) {
1424                         rt6_do_update_pmtu(nrt6, mtu);
1425
1426                         /* ip6_ins_rt(nrt6) will bump the
1427                          * rt6->rt6i_node->fn_sernum
1428                          * which will fail the next rt6_check() and
1429                          * invalidate the sk->sk_dst_cache.
1430                          */
1431                         ip6_ins_rt(nrt6);
1432                         /* Release the reference taken in
1433                          * ip6_rt_cache_alloc()
1434                          */
1435                         dst_release(&nrt6->dst);
1436                 }
1437         }
1438 }
1439
1440 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1441                                struct sk_buff *skb, u32 mtu)
1442 {
1443         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1444 }
1445
1446 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1447                      int oif, u32 mark, kuid_t uid)
1448 {
1449         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1450         struct dst_entry *dst;
1451         struct flowi6 fl6;
1452
1453         memset(&fl6, 0, sizeof(fl6));
1454         fl6.flowi6_oif = oif;
1455         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1456         fl6.daddr = iph->daddr;
1457         fl6.saddr = iph->saddr;
1458         fl6.flowlabel = ip6_flowinfo(iph);
1459         fl6.flowi6_uid = uid;
1460
1461         dst = ip6_route_output(net, NULL, &fl6);
1462         if (!dst->error)
1463                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1464         dst_release(dst);
1465 }
1466 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1467
1468 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1469 {
1470         struct dst_entry *dst;
1471
1472         ip6_update_pmtu(skb, sock_net(sk), mtu,
1473                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
1474
1475         dst = __sk_dst_get(sk);
1476         if (!dst || !dst->obsolete ||
1477             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1478                 return;
1479
1480         bh_lock_sock(sk);
1481         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1482                 ip6_datagram_dst_update(sk, false);
1483         bh_unlock_sock(sk);
1484 }
1485 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1486
1487 /* Handle redirects */
1488 struct ip6rd_flowi {
1489         struct flowi6 fl6;
1490         struct in6_addr gateway;
1491 };
1492
1493 static struct rt6_info *__ip6_route_redirect(struct net *net,
1494                                              struct fib6_table *table,
1495                                              struct flowi6 *fl6,
1496                                              int flags)
1497 {
1498         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1499         struct rt6_info *rt;
1500         struct fib6_node *fn;
1501
1502         /* Get the "current" route for this destination and
1503          * check if the redirect has come from appropriate router.
1504          *
1505          * RFC 4861 specifies that redirects should only be
1506          * accepted if they come from the nexthop to the target.
1507          * Due to the way the routes are chosen, this notion
1508          * is a bit fuzzy and one might need to check all possible
1509          * routes.
1510          */
1511
1512         read_lock_bh(&table->tb6_lock);
1513         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1514 restart:
1515         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1516                 if (rt6_check_expired(rt))
1517                         continue;
1518                 if (rt->dst.error)
1519                         break;
1520                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1521                         continue;
1522                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1523                         continue;
1524                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1525                         continue;
1526                 break;
1527         }
1528
1529         if (!rt)
1530                 rt = net->ipv6.ip6_null_entry;
1531         else if (rt->dst.error) {
1532                 rt = net->ipv6.ip6_null_entry;
1533                 goto out;
1534         }
1535
1536         if (rt == net->ipv6.ip6_null_entry) {
1537                 fn = fib6_backtrack(fn, &fl6->saddr);
1538                 if (fn)
1539                         goto restart;
1540         }
1541
1542 out:
1543         dst_hold(&rt->dst);
1544
1545         read_unlock_bh(&table->tb6_lock);
1546
1547         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1548         return rt;
1549 }
1550
1551 static struct dst_entry *ip6_route_redirect(struct net *net,
1552                                         const struct flowi6 *fl6,
1553                                         const struct in6_addr *gateway)
1554 {
1555         int flags = RT6_LOOKUP_F_HAS_SADDR;
1556         struct ip6rd_flowi rdfl;
1557
1558         rdfl.fl6 = *fl6;
1559         rdfl.gateway = *gateway;
1560
1561         return fib6_rule_lookup(net, &rdfl.fl6,
1562                                 flags, __ip6_route_redirect);
1563 }
1564
1565 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
1566                   kuid_t uid)
1567 {
1568         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1569         struct dst_entry *dst;
1570         struct flowi6 fl6;
1571
1572         memset(&fl6, 0, sizeof(fl6));
1573         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1574         fl6.flowi6_oif = oif;
1575         fl6.flowi6_mark = mark;
1576         fl6.daddr = iph->daddr;
1577         fl6.saddr = iph->saddr;
1578         fl6.flowlabel = ip6_flowinfo(iph);
1579         fl6.flowi6_uid = uid;
1580
1581         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1582         rt6_do_redirect(dst, NULL, skb);
1583         dst_release(dst);
1584 }
1585 EXPORT_SYMBOL_GPL(ip6_redirect);
1586
1587 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1588                             u32 mark)
1589 {
1590         const struct ipv6hdr *iph = ipv6_hdr(skb);
1591         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1592         struct dst_entry *dst;
1593         struct flowi6 fl6;
1594
1595         memset(&fl6, 0, sizeof(fl6));
1596         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1597         fl6.flowi6_oif = oif;
1598         fl6.flowi6_mark = mark;
1599         fl6.daddr = msg->dest;
1600         fl6.saddr = iph->daddr;
1601         fl6.flowi6_uid = sock_net_uid(net, NULL);
1602
1603         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1604         rt6_do_redirect(dst, NULL, skb);
1605         dst_release(dst);
1606 }
1607
1608 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1609 {
1610         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
1611                      sk->sk_uid);
1612 }
1613 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1614
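/* Default advertised MSS for this route: the dst MTU minus the IPv6 and
 * TCP headers, never below the ip6_rt_min_advmss sysctl; values above the
 * non-jumbo limit are reported as IPV6_MAXPLEN (i.e. rely on PMTU
 * discovery).
 */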
1615 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1616 {
1617         struct net_device *dev = dst->dev;
1618         unsigned int mtu = dst_mtu(dst);
1619         struct net *net = dev_net(dev);
1620
1621         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1622
1623         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1624                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1625
1626         /*
1627          * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and the
1628          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1629          * IPV6_MAXPLEN is also valid and means: "any MSS,
1630          * rely only on PMTU discovery".
1631          */
1632         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1633                 mtu = IPV6_MAXPLEN;
1634         return mtu;
1635 }
1636
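/* Effective MTU for this dst: the cached PMTU if set, otherwise the
 * RTAX_MTU metric, otherwise the device's IPv6 MTU; clamped to
 * IP6_MAX_MTU and reduced by any lwtunnel encapsulation headroom.
 */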
1637 static unsigned int ip6_mtu(const struct dst_entry *dst)
1638 {
1639         const struct rt6_info *rt = (const struct rt6_info *)dst;
1640         unsigned int mtu = rt->rt6i_pmtu;
1641         struct inet6_dev *idev;
1642
1643         if (mtu)
1644                 goto out;
1645
1646         mtu = dst_metric_raw(dst, RTAX_MTU);
1647         if (mtu)
1648                 goto out;
1649
1650         mtu = IPV6_MIN_MTU;
1651
1652         rcu_read_lock();
1653         idev = __in6_dev_get(dst->dev);
1654         if (idev)
1655                 mtu = idev->cnf.mtu6;
1656         rcu_read_unlock();
1657
1658 out:
1659         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1660
1661         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1662 }
1663
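/* Allocate an uncached host route towards fl6->daddr on @dev, e.g. for
 * ICMPv6/ndisc output.  The route is kept on the uncached list so device
 * teardown can release it, and is passed through xfrm_lookup() before
 * being returned.
 */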
1664 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1665                                   struct flowi6 *fl6)
1666 {
1667         struct dst_entry *dst;
1668         struct rt6_info *rt;
1669         struct inet6_dev *idev = in6_dev_get(dev);
1670         struct net *net = dev_net(dev);
1671
1672         if (unlikely(!idev))
1673                 return ERR_PTR(-ENODEV);
1674
1675         rt = ip6_dst_alloc(net, dev, 0);
1676         if (unlikely(!rt)) {
1677                 in6_dev_put(idev);
1678                 dst = ERR_PTR(-ENOMEM);
1679                 goto out;
1680         }
1681
1682         rt->dst.flags |= DST_HOST;
1683         rt->dst.output  = ip6_output;
1684         rt->rt6i_gateway  = fl6->daddr;
1685         rt->rt6i_dst.addr = fl6->daddr;
1686         rt->rt6i_dst.plen = 128;
1687         rt->rt6i_idev     = idev;
1688         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1689
1690         /* Add this dst to uncached_list so that rt6_ifdown() can
1691          * properly release the net_device
1692          */
1693         rt6_uncached_list_add(rt);
1694
1695         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1696
1697 out:
1698         return dst;
1699 }
1700
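/* dst_ops garbage-collection hook.  GC is skipped while the entry count
 * stays within ip6_rt_max_size and the last run was less than
 * gc_min_interval ago; otherwise fib6_run_gc() is run with an expire
 * value that grows under pressure and decays with gc_elasticity.
 * Returns nonzero while the table is still over rt_max_size.
 */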
1701 static int ip6_dst_gc(struct dst_ops *ops)
1702 {
1703         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1704         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1705         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1706         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1707         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1708         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1709         int entries;
1710
1711         entries = dst_entries_get_fast(ops);
1712         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1713             entries <= rt_max_size)
1714                 goto out;
1715
1716         net->ipv6.ip6_rt_gc_expire++;
1717         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1718         entries = dst_entries_get_slow(ops);
1719         if (entries < ops->gc_thresh)
1720                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1721 out:
1722         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1723         return entries > rt_max_size;
1724 }
1725
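/* Convert the RTA_METRICS nest in cfg->fc_mx into an RTAX_* array for
 * FIB insertion (e.g. an "mtu" option on a route add request typically
 * arrives here as RTAX_MTU).  RTAX_CC_ALGO is given by name and resolved
 * via tcp_ca_get_key_by_name(); RTAX_HOPLIMIT is capped at 255 and
 * unknown RTAX_FEATURES bits are rejected.
 */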
1726 static int ip6_convert_metrics(struct mx6_config *mxc,
1727                                const struct fib6_config *cfg)
1728 {
1729         bool ecn_ca = false;
1730         struct nlattr *nla;
1731         int remaining;
1732         u32 *mp;
1733
1734         if (!cfg->fc_mx)
1735                 return 0;
1736
1737         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1738         if (unlikely(!mp))
1739                 return -ENOMEM;
1740
1741         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1742                 int type = nla_type(nla);
1743                 u32 val;
1744
1745                 if (!type)
1746                         continue;
1747                 if (unlikely(type > RTAX_MAX))
1748                         goto err;
1749
1750                 if (type == RTAX_CC_ALGO) {
1751                         char tmp[TCP_CA_NAME_MAX];
1752
1753                         nla_strlcpy(tmp, nla, sizeof(tmp));
1754                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1755                         if (val == TCP_CA_UNSPEC)
1756                                 goto err;
1757                 } else {
1758                         val = nla_get_u32(nla);
1759                 }
1760                 if (type == RTAX_HOPLIMIT && val > 255)
1761                         val = 255;
1762                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1763                         goto err;
1764
1765                 mp[type - 1] = val;
1766                 __set_bit(type - 1, mxc->mx_valid);
1767         }
1768
1769         if (ecn_ca) {
1770                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1771                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1772         }
1773
1774         mxc->mx = mp;
1775         return 0;
1776  err:
1777         kfree(mp);
1778         return -EINVAL;
1779 }
1780
1781 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1782                                             struct fib6_config *cfg,
1783                                             const struct in6_addr *gw_addr)
1784 {
1785         struct flowi6 fl6 = {
1786                 .flowi6_oif = cfg->fc_ifindex,
1787                 .daddr = *gw_addr,
1788                 .saddr = cfg->fc_prefsrc,
1789         };
1790         struct fib6_table *table;
1791         struct rt6_info *rt;
1792         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1793
1794         table = fib6_get_table(net, cfg->fc_table);
1795         if (!table)
1796                 return NULL;
1797
1798         if (!ipv6_addr_any(&cfg->fc_prefsrc))
1799                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1800
1801         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1802
1803         /* if table lookup failed, fall back to full lookup */
1804         if (rt == net->ipv6.ip6_null_entry) {
1805                 ip6_rt_put(rt);
1806                 rt = NULL;
1807         }
1808
1809         return rt;
1810 }
1811
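/* Validate @cfg and build a new rt6_info from it without inserting it
 * into the FIB: resolve the egress device and gateway, set up lwtunnel
 * state, and turn loopback/reject configurations into reject routes.
 * Returns the route or an ERR_PTR() on failure.
 */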
1812 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
1813                                               struct netlink_ext_ack *extack)
1814 {
1815         struct net *net = cfg->fc_nlinfo.nl_net;
1816         struct rt6_info *rt = NULL;
1817         struct net_device *dev = NULL;
1818         struct inet6_dev *idev = NULL;
1819         struct fib6_table *table;
1820         int addr_type;
1821         int err = -EINVAL;
1822
1823         /* RTF_PCPU is an internal flag; it cannot be set by userspace */
1824         if (cfg->fc_flags & RTF_PCPU) {
1825                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
1826                 goto out;
1827         }
1828
1829         if (cfg->fc_dst_len > 128) {
1830                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
1831                 goto out;
1832         }
1833         if (cfg->fc_src_len > 128) {
1834                 NL_SET_ERR_MSG(extack, "Invalid source address length");
1835                 goto out;
1836         }
1837 #ifndef CONFIG_IPV6_SUBTREES
1838         if (cfg->fc_src_len) {
1839                 NL_SET_ERR_MSG(extack,
1840                                "Specifying source address requires IPV6_SUBTREES to be enabled");
1841                 goto out;
1842         }
1843 #endif
1844         if (cfg->fc_ifindex) {
1845                 err = -ENODEV;
1846                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1847                 if (!dev)
1848                         goto out;
1849                 idev = in6_dev_get(dev);
1850                 if (!idev)
1851                         goto out;
1852         }
1853
1854         if (cfg->fc_metric == 0)
1855                 cfg->fc_metric = IP6_RT_PRIO_USER;
1856
1857         err = -ENOBUFS;
1858         if (cfg->fc_nlinfo.nlh &&
1859             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1860                 table = fib6_get_table(net, cfg->fc_table);
1861                 if (!table) {
1862                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1863                         table = fib6_new_table(net, cfg->fc_table);
1864                 }
1865         } else {
1866                 table = fib6_new_table(net, cfg->fc_table);
1867         }
1868
1869         if (!table)
1870                 goto out;
1871
1872         rt = ip6_dst_alloc(net, NULL,
1873                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1874
1875         if (!rt) {
1876                 err = -ENOMEM;
1877                 goto out;
1878         }
1879
1880         if (cfg->fc_flags & RTF_EXPIRES)
1881                 rt6_set_expires(rt, jiffies +
1882                                 clock_t_to_jiffies(cfg->fc_expires));
1883         else
1884                 rt6_clean_expires(rt);
1885
1886         if (cfg->fc_protocol == RTPROT_UNSPEC)
1887                 cfg->fc_protocol = RTPROT_BOOT;
1888         rt->rt6i_protocol = cfg->fc_protocol;
1889
1890         addr_type = ipv6_addr_type(&cfg->fc_dst);
1891
1892         if (addr_type & IPV6_ADDR_MULTICAST)
1893                 rt->dst.input = ip6_mc_input;
1894         else if (cfg->fc_flags & RTF_LOCAL)
1895                 rt->dst.input = ip6_input;
1896         else
1897                 rt->dst.input = ip6_forward;
1898
1899         rt->dst.output = ip6_output;
1900
1901         if (cfg->fc_encap) {
1902                 struct lwtunnel_state *lwtstate;
1903
1904                 err = lwtunnel_build_state(cfg->fc_encap_type,
1905                                            cfg->fc_encap, AF_INET6, cfg,
1906                                            &lwtstate, extack);
1907                 if (err)
1908                         goto out;
1909                 rt->dst.lwtstate = lwtstate_get(lwtstate);
1910                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1911                         rt->dst.lwtstate->orig_output = rt->dst.output;
1912                         rt->dst.output = lwtunnel_output;
1913                 }
1914                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1915                         rt->dst.lwtstate->orig_input = rt->dst.input;
1916                         rt->dst.input = lwtunnel_input;
1917                 }
1918         }
1919
1920         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1921         rt->rt6i_dst.plen = cfg->fc_dst_len;
1922         if (rt->rt6i_dst.plen == 128)
1923                 rt->dst.flags |= DST_HOST;
1924
1925 #ifdef CONFIG_IPV6_SUBTREES
1926         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1927         rt->rt6i_src.plen = cfg->fc_src_len;
1928 #endif
1929
1930         rt->rt6i_metric = cfg->fc_metric;
1931
1932         /* We cannot add true routes via loopback here;
1933            they would result in kernel looping. Promote them to reject routes.
1934          */
1935         if ((cfg->fc_flags & RTF_REJECT) ||
1936             (dev && (dev->flags & IFF_LOOPBACK) &&
1937              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1938              !(cfg->fc_flags & RTF_LOCAL))) {
1939                 /* hold loopback dev/idev if we haven't done so. */
1940                 if (dev != net->loopback_dev) {
1941                         if (dev) {
1942                                 dev_put(dev);
1943                                 in6_dev_put(idev);
1944                         }
1945                         dev = net->loopback_dev;
1946                         dev_hold(dev);
1947                         idev = in6_dev_get(dev);
1948                         if (!idev) {
1949                                 err = -ENODEV;
1950                                 goto out;
1951                         }
1952                 }
1953                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1954                 switch (cfg->fc_type) {
1955                 case RTN_BLACKHOLE:
1956                         rt->dst.error = -EINVAL;
1957                         rt->dst.output = dst_discard_out;
1958                         rt->dst.input = dst_discard;
1959                         break;
1960                 case RTN_PROHIBIT:
1961                         rt->dst.error = -EACCES;
1962                         rt->dst.output = ip6_pkt_prohibit_out;
1963                         rt->dst.input = ip6_pkt_prohibit;
1964                         break;
1965                 case RTN_THROW:
1966                 case RTN_UNREACHABLE:
1967                 default:
1968                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1969                                         : (cfg->fc_type == RTN_UNREACHABLE)
1970                                         ? -EHOSTUNREACH : -ENETUNREACH;
1971                         rt->dst.output = ip6_pkt_discard_out;
1972                         rt->dst.input = ip6_pkt_discard;
1973                         break;
1974                 }
1975                 goto install_route;
1976         }
1977
1978         if (cfg->fc_flags & RTF_GATEWAY) {
1979                 const struct in6_addr *gw_addr;
1980                 int gwa_type;
1981
1982                 gw_addr = &cfg->fc_gateway;
1983                 gwa_type = ipv6_addr_type(gw_addr);
1984
1985                 /* if gw_addr is local we will fail to detect this while the
1986                  * address is still TENTATIVE (DAD in progress): rt6_lookup()
1987                  * will return the already-added prefix route via the interface
1988                  * the prefix route was assigned to, which might be non-loopback.
1989                  */
1990                 err = -EINVAL;
1991                 if (ipv6_chk_addr_and_flags(net, gw_addr,
1992                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
1993                                             dev : NULL, 0, 0)) {
1994                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
1995                         goto out;
1996                 }
1997                 rt->rt6i_gateway = *gw_addr;
1998
1999                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2000                         struct rt6_info *grt = NULL;
2001
2002                         /* IPv6 strictly prohibits using non-link-local
2003                            addresses as the nexthop address.
2004                            Otherwise, a router would not be able to send redirects.
2005                            That is a good rule, but in some (rare!) circumstances
2006                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2007                            some exceptions. --ANK
2008                            We allow IPv4-mapped nexthops to support RFC 4798-style
2009                            addressing.
2010                          */
2011                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2012                                           IPV6_ADDR_MAPPED))) {
2013                                 NL_SET_ERR_MSG(extack,
2014                                                "Invalid gateway address");
2015                                 goto out;
2016                         }
2017
2018                         if (cfg->fc_table) {
2019                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2020
2021                                 if (grt) {
2022                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2023                                             (dev && dev != grt->dst.dev)) {
2024                                                 ip6_rt_put(grt);
2025                                                 grt = NULL;
2026                                         }
2027                                 }
2028                         }
2029
2030                         if (!grt)
2031                                 grt = rt6_lookup(net, gw_addr, NULL,
2032                                                  cfg->fc_ifindex, 1);
2033
2034                         err = -EHOSTUNREACH;
2035                         if (!grt)
2036                                 goto out;
2037                         if (dev) {
2038                                 if (dev != grt->dst.dev) {
2039                                         ip6_rt_put(grt);
2040                                         goto out;
2041                                 }
2042                         } else {
2043                                 dev = grt->dst.dev;
2044                                 idev = grt->rt6i_idev;
2045                                 dev_hold(dev);
2046                                 in6_dev_hold(grt->rt6i_idev);
2047                         }
2048                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2049                                 err = 0;
2050                         ip6_rt_put(grt);
2051
2052                         if (err)
2053                                 goto out;
2054                 }
2055                 err = -EINVAL;
2056                 if (!dev) {
2057                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2058                         goto out;
2059                 } else if (dev->flags & IFF_LOOPBACK) {
2060                         NL_SET_ERR_MSG(extack,
2061                                        "Egress device can not be loopback device for this route");
2062                         goto out;
2063                 }
2064         }
2065
2066         err = -ENODEV;
2067         if (!dev)
2068                 goto out;
2069
2070         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2071                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2072                         NL_SET_ERR_MSG(extack, "Invalid source address");
2073                         err = -EINVAL;
2074                         goto out;
2075                 }
2076                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2077                 rt->rt6i_prefsrc.plen = 128;
2078         } else
2079                 rt->rt6i_prefsrc.plen = 0;
2080
2081         rt->rt6i_flags = cfg->fc_flags;
2082
2083 install_route:
2084         rt->dst.dev = dev;
2085         rt->rt6i_idev = idev;
2086         rt->rt6i_table = table;
2087
2088         cfg->fc_nlinfo.nl_net = dev_net(dev);
2089
2090         return rt;
2091 out:
2092         if (dev)
2093                 dev_put(dev);
2094         if (idev)
2095                 in6_dev_put(idev);
2096         if (rt)
2097                 dst_release_immediate(&rt->dst);
2098
2099         return ERR_PTR(err);
2100 }
2101
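/* Create a route from @cfg via ip6_route_info_create(), convert its
 * netlink metrics and insert it into the FIB with __ip6_ins_rt().
 */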
2102 int ip6_route_add(struct fib6_config *cfg,
2103                   struct netlink_ext_ack *extack)
2104 {
2105         struct mx6_config mxc = { .mx = NULL, };
2106         struct rt6_info *rt;
2107         int err;
2108
2109         rt = ip6_route_info_create(cfg, extack);
2110         if (IS_ERR(rt)) {
2111                 err = PTR_ERR(rt);
2112                 rt = NULL;
2113                 goto out;
2114         }
2115
2116         err = ip6_convert_metrics(&mxc, cfg);
2117         if (err)
2118                 goto out;
2119
2120         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2121
2122         kfree(mxc.mx);
2123
2124         return err;
2125 out:
2126         if (rt)
2127                 dst_release_immediate(&rt->dst);
2128
2129         return err;
2130 }
2131
2132 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2133 {
2134         int err;
2135         struct fib6_table *table;
2136         struct net *net = dev_net(rt->dst.dev);
2137
2138         if (rt == net->ipv6.ip6_null_entry) {
2139                 err = -ENOENT;
2140                 goto out;
2141         }
2142
2143         table = rt->rt6i_table;
2144         write_lock_bh(&table->tb6_lock);
2145         err = fib6_del(rt, info);
2146         write_unlock_bh(&table->tb6_lock);
2147
2148 out:
2149         ip6_rt_put(rt);
2150         return err;
2151 }
2152
2153 int ip6_del_rt(struct rt6_info *rt)
2154 {
2155         struct nl_info info = {
2156                 .nl_net = dev_net(rt->dst.dev),
2157         };
2158         return __ip6_del_rt(rt, &info);
2159 }
2160
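/* Delete @rt together with all of its multipath siblings when
 * fc_delete_all_nh is set, preferring a single RTM_DELROUTE notification
 * that describes every hop.
 */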
2161 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2162 {
2163         struct nl_info *info = &cfg->fc_nlinfo;
2164         struct net *net = info->nl_net;
2165         struct sk_buff *skb = NULL;
2166         struct fib6_table *table;
2167         int err = -ENOENT;
2168
2169         if (rt == net->ipv6.ip6_null_entry)
2170                 goto out_put;
2171         table = rt->rt6i_table;
2172         write_lock_bh(&table->tb6_lock);
2173
2174         if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2175                 struct rt6_info *sibling, *next_sibling;
2176
2177                 /* prefer to send a single notification with all hops */
2178                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2179                 if (skb) {
2180                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2181
2182                         if (rt6_fill_node(net, skb, rt,
2183                                           NULL, NULL, 0, RTM_DELROUTE,
2184                                           info->portid, seq, 0) < 0) {
2185                                 kfree_skb(skb);
2186                                 skb = NULL;
2187                         } else
2188                                 info->skip_notify = 1;
2189                 }
2190
2191                 list_for_each_entry_safe(sibling, next_sibling,
2192                                          &rt->rt6i_siblings,
2193                                          rt6i_siblings) {
2194                         err = fib6_del(sibling, info);
2195                         if (err)
2196                                 goto out_unlock;
2197                 }
2198         }
2199
2200         err = fib6_del(rt, info);
2201 out_unlock:
2202         write_unlock_bh(&table->tb6_lock);
2203 out_put:
2204         ip6_rt_put(rt);
2205
2206         if (skb) {
2207                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2208                             info->nlh, gfp_any());
2209         }
2210         return err;
2211 }
2212
2213 static int ip6_route_del(struct fib6_config *cfg,
2214                          struct netlink_ext_ack *extack)
2215 {
2216         struct fib6_table *table;
2217         struct fib6_node *fn;
2218         struct rt6_info *rt;
2219         int err = -ESRCH;
2220
2221         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2222         if (!table) {
2223                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2224                 return err;
2225         }
2226
2227         read_lock_bh(&table->tb6_lock);
2228
2229         fn = fib6_locate(&table->tb6_root,
2230                          &cfg->fc_dst, cfg->fc_dst_len,
2231                          &cfg->fc_src, cfg->fc_src_len);
2232
2233         if (fn) {
2234                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2235                         if ((rt->rt6i_flags & RTF_CACHE) &&
2236                             !(cfg->fc_flags & RTF_CACHE))
2237                                 continue;
2238                         if (cfg->fc_ifindex &&
2239                             (!rt->dst.dev ||
2240                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2241                                 continue;
2242                         if (cfg->fc_flags & RTF_GATEWAY &&
2243                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2244                                 continue;
2245                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2246                                 continue;
2247                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2248                                 continue;
2249                         dst_hold(&rt->dst);
2250                         read_unlock_bh(&table->tb6_lock);
2251
2252                         /* if a gateway was specified, delete only that one hop */
2253                         if (cfg->fc_flags & RTF_GATEWAY)
2254                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2255
2256                         return __ip6_del_rt_siblings(rt, cfg);
2257                 }
2258         }
2259         read_unlock_bh(&table->tb6_lock);
2260
2261         return err;
2262 }
2263
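/* Handle a received ICMPv6 Redirect: sanity-check the message and its
 * ND options, confirm and update the neighbour entry for the new first
 * hop, and install an RTF_CACHE clone of the route that points at the
 * new gateway (replacing a previously cached entry if necessary).
 */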
2264 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2265 {
2266         struct netevent_redirect netevent;
2267         struct rt6_info *rt, *nrt = NULL;
2268         struct ndisc_options ndopts;
2269         struct inet6_dev *in6_dev;
2270         struct neighbour *neigh;
2271         struct rd_msg *msg;
2272         int optlen, on_link;
2273         u8 *lladdr;
2274
2275         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2276         optlen -= sizeof(*msg);
2277
2278         if (optlen < 0) {
2279                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2280                 return;
2281         }
2282
2283         msg = (struct rd_msg *)icmp6_hdr(skb);
2284
2285         if (ipv6_addr_is_multicast(&msg->dest)) {
2286                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2287                 return;
2288         }
2289
2290         on_link = 0;
2291         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2292                 on_link = 1;
2293         } else if (ipv6_addr_type(&msg->target) !=
2294                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2295                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2296                 return;
2297         }
2298
2299         in6_dev = __in6_dev_get(skb->dev);
2300         if (!in6_dev)
2301                 return;
2302         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2303                 return;
2304
2305         /* RFC2461 8.1:
2306          *      The IP source address of the Redirect MUST be the same as the current
2307          *      first-hop router for the specified ICMP Destination Address.
2308          */
2309
2310         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2311                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2312                 return;
2313         }
2314
2315         lladdr = NULL;
2316         if (ndopts.nd_opts_tgt_lladdr) {
2317                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2318                                              skb->dev);
2319                 if (!lladdr) {
2320                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2321                         return;
2322                 }
2323         }
2324
2325         rt = (struct rt6_info *) dst;
2326         if (rt->rt6i_flags & RTF_REJECT) {
2327                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2328                 return;
2329         }
2330
2331         /* Redirect received -> path was valid.
2332          * Look, redirects are sent only in response to data packets,
2333          * so this nexthop is apparently reachable. --ANK
2334          */
2335         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2336
2337         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2338         if (!neigh)
2339                 return;
2340
2341         /*
2342          *      We have finally decided to accept it.
2343          */
2344
2345         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2346                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2347                      NEIGH_UPDATE_F_OVERRIDE|
2348                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2349                                      NEIGH_UPDATE_F_ISROUTER)),
2350                      NDISC_REDIRECT, &ndopts);
2351
2352         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2353         if (!nrt)
2354                 goto out;
2355
2356         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2357         if (on_link)
2358                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2359
2360         nrt->rt6i_protocol = RTPROT_REDIRECT;
2361         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2362
2363         if (ip6_ins_rt(nrt))
2364                 goto out_release;
2365
2366         netevent.old = &rt->dst;
2367         netevent.new = &nrt->dst;
2368         netevent.daddr = &msg->dest;
2369         netevent.neigh = neigh;
2370         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2371
2372         if (rt->rt6i_flags & RTF_CACHE) {
2373                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2374                 ip6_del_rt(rt);
2375         }
2376
2377 out_release:
2378         /* Release the reference taken in
2379          * ip6_rt_cache_alloc()
2380          */
2381         dst_release(&nrt->dst);
2382
2383 out:
2384         neigh_release(neigh);
2385 }
2386
2387 /*
2388  *      Misc support functions
2389  */
2390
2391 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2392 {
2393         BUG_ON(from->dst.from);
2394
2395         rt->rt6i_flags &= ~RTF_EXPIRES;
2396         dst_hold(&from->dst);
2397         rt->dst.from = &from->dst;
2398         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2399 }
2400
2401 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2402 {
2403         rt->dst.input = ort->dst.input;
2404         rt->dst.output = ort->dst.output;
2405         rt->rt6i_dst = ort->rt6i_dst;
2406         rt->dst.error = ort->dst.error;
2407         rt->rt6i_idev = ort->rt6i_idev;
2408         if (rt->rt6i_idev)
2409                 in6_dev_hold(rt->rt6i_idev);
2410         rt->dst.lastuse = jiffies;
2411         rt->rt6i_gateway = ort->rt6i_gateway;
2412         rt->rt6i_flags = ort->rt6i_flags;
2413         rt6_set_from(rt, ort);
2414         rt->rt6i_metric = ort->rt6i_metric;
2415 #ifdef CONFIG_IPV6_SUBTREES
2416         rt->rt6i_src = ort->rt6i_src;
2417 #endif
2418         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2419         rt->rt6i_table = ort->rt6i_table;
2420         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2421 }
2422
2423 #ifdef CONFIG_IPV6_ROUTE_INFO
2424 static struct rt6_info *rt6_get_route_info(struct net *net,
2425                                            const struct in6_addr *prefix, int prefixlen,
2426                                            const struct in6_addr *gwaddr,
2427                                            struct net_device *dev)
2428 {
2429         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2430         int ifindex = dev->ifindex;
2431         struct fib6_node *fn;
2432         struct rt6_info *rt = NULL;
2433         struct fib6_table *table;
2434
2435         table = fib6_get_table(net, tb_id);
2436         if (!table)
2437                 return NULL;
2438
2439         read_lock_bh(&table->tb6_lock);
2440         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2441         if (!fn)
2442                 goto out;
2443
2444         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2445                 if (rt->dst.dev->ifindex != ifindex)
2446                         continue;
2447                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2448                         continue;
2449                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2450                         continue;
2451                 dst_hold(&rt->dst);
2452                 break;
2453         }
2454 out:
2455         read_unlock_bh(&table->tb6_lock);
2456         return rt;
2457 }
2458
2459 static struct rt6_info *rt6_add_route_info(struct net *net,
2460                                            const struct in6_addr *prefix, int prefixlen,
2461                                            const struct in6_addr *gwaddr,
2462                                            struct net_device *dev,
2463                                            unsigned int pref)
2464 {
2465         struct fib6_config cfg = {
2466                 .fc_metric      = IP6_RT_PRIO_USER,
2467                 .fc_ifindex     = dev->ifindex,
2468                 .fc_dst_len     = prefixlen,
2469                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2470                                   RTF_UP | RTF_PREF(pref),
2471                 .fc_protocol = RTPROT_RA,
2472                 .fc_nlinfo.portid = 0,
2473                 .fc_nlinfo.nlh = NULL,
2474                 .fc_nlinfo.nl_net = net,
2475         };
2476
2477         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2478         cfg.fc_dst = *prefix;
2479         cfg.fc_gateway = *gwaddr;
2480
2481         /* We should treat it as a default route if prefix length is 0. */
2482         if (!prefixlen)
2483                 cfg.fc_flags |= RTF_DEFAULT;
2484
2485         ip6_route_add(&cfg, NULL);
2486
2487         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2488 }
2489 #endif
2490
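/* Find the RA-learned (RTF_ADDRCONF | RTF_DEFAULT) default route via
 * @dev whose gateway is @addr, in the l3mdev-aware default table.
 */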
2491 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2492 {
2493         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2494         struct rt6_info *rt;
2495         struct fib6_table *table;
2496
2497         table = fib6_get_table(dev_net(dev), tb_id);
2498         if (!table)
2499                 return NULL;
2500
2501         read_lock_bh(&table->tb6_lock);
2502         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2503                 if (dev == rt->dst.dev &&
2504                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2505                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2506                         break;
2507         }
2508         if (rt)
2509                 dst_hold(&rt->dst);
2510         read_unlock_bh(&table->tb6_lock);
2511         return rt;
2512 }
2513
2514 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2515                                      struct net_device *dev,
2516                                      unsigned int pref)
2517 {
2518         struct fib6_config cfg = {
2519                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2520                 .fc_metric      = IP6_RT_PRIO_USER,
2521                 .fc_ifindex     = dev->ifindex,
2522                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2523                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2524                 .fc_protocol = RTPROT_RA,
2525                 .fc_nlinfo.portid = 0,
2526                 .fc_nlinfo.nlh = NULL,
2527                 .fc_nlinfo.nl_net = dev_net(dev),
2528         };
2529
2530         cfg.fc_gateway = *gwaddr;
2531
2532         if (!ip6_route_add(&cfg, NULL)) {
2533                 struct fib6_table *table;
2534
2535                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
2536                 if (table)
2537                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2538         }
2539
2540         return rt6_get_dflt_router(gwaddr, dev);
2541 }
2542
2543 static void __rt6_purge_dflt_routers(struct fib6_table *table)
2544 {
2545         struct rt6_info *rt;
2546
2547 restart:
2548         read_lock_bh(&table->tb6_lock);
2549         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2550                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2551                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2552                         dst_hold(&rt->dst);
2553                         read_unlock_bh(&table->tb6_lock);
2554                         ip6_del_rt(rt);
2555                         goto restart;
2556                 }
2557         }
2558         read_unlock_bh(&table->tb6_lock);
2559
2560         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
2561 }
2562
2563 void rt6_purge_dflt_routers(struct net *net)
2564 {
2565         struct fib6_table *table;
2566         struct hlist_head *head;
2567         unsigned int h;
2568
2569         rcu_read_lock();
2570
2571         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2572                 head = &net->ipv6.fib_table_hash[h];
2573                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2574                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2575                                 __rt6_purge_dflt_routers(table);
2576                 }
2577         }
2578
2579         rcu_read_unlock();
2580 }
2581
2582 static void rtmsg_to_fib6_config(struct net *net,
2583                                  struct in6_rtmsg *rtmsg,
2584                                  struct fib6_config *cfg)
2585 {
2586         memset(cfg, 0, sizeof(*cfg));
2587
2588         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2589                          : RT6_TABLE_MAIN;
2590         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2591         cfg->fc_metric = rtmsg->rtmsg_metric;
2592         cfg->fc_expires = rtmsg->rtmsg_info;
2593         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2594         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2595         cfg->fc_flags = rtmsg->rtmsg_flags;
2596
2597         cfg->fc_nlinfo.nl_net = net;
2598
2599         cfg->fc_dst = rtmsg->rtmsg_dst;
2600         cfg->fc_src = rtmsg->rtmsg_src;
2601         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2602 }
2603
2604 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2605 {
2606         struct fib6_config cfg;
2607         struct in6_rtmsg rtmsg;
2608         int err;
2609
2610         switch (cmd) {
2611         case SIOCADDRT:         /* Add a route */
2612         case SIOCDELRT:         /* Delete a route */
2613                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2614                         return -EPERM;
2615                 err = copy_from_user(&rtmsg, arg,
2616                                      sizeof(struct in6_rtmsg));
2617                 if (err)
2618                         return -EFAULT;
2619
2620                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2621
2622                 rtnl_lock();
2623                 switch (cmd) {
2624                 case SIOCADDRT:
2625                         err = ip6_route_add(&cfg, NULL);
2626                         break;
2627                 case SIOCDELRT:
2628                         err = ip6_route_del(&cfg, NULL);
2629                         break;
2630                 default:
2631                         err = -EINVAL;
2632                 }
2633                 rtnl_unlock();
2634
2635                 return err;
2636         }
2637
2638         return -EINVAL;
2639 }
2640
2641 /*
2642  *      Drop the packet on the floor
2643  */
2644
2645 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2646 {
2647         int type;
2648         struct dst_entry *dst = skb_dst(skb);
2649         switch (ipstats_mib_noroutes) {
2650         case IPSTATS_MIB_INNOROUTES:
2651                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2652                 if (type == IPV6_ADDR_ANY) {
2653                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2654                                       IPSTATS_MIB_INADDRERRORS);
2655                         break;
2656                 }
2657                 /* FALLTHROUGH */
2658         case IPSTATS_MIB_OUTNOROUTES:
2659                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2660                               ipstats_mib_noroutes);
2661                 break;
2662         }
2663         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2664         kfree_skb(skb);
2665         return 0;
2666 }
2667
2668 static int ip6_pkt_discard(struct sk_buff *skb)
2669 {
2670         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2671 }
2672
2673 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2674 {
2675         skb->dev = skb_dst(skb)->dev;
2676         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2677 }
2678
2679 static int ip6_pkt_prohibit(struct sk_buff *skb)
2680 {
2681         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2682 }
2683
2684 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2685 {
2686         skb->dev = skb_dst(skb)->dev;
2687         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2688 }
2689
2690 /*
2691  *      Allocate a dst for local (unicast / anycast) address.
2692  */
2693
2694 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2695                                     const struct in6_addr *addr,
2696                                     bool anycast)
2697 {
2698         u32 tb_id;
2699         struct net *net = dev_net(idev->dev);
2700         struct net_device *dev = net->loopback_dev;
2701         struct rt6_info *rt;
2702
2703         /* use the L3 master device as loopback for host routes if the device
2704          * is enslaved and the address is not link-local or multicast
2705          */
2706         if (!rt6_need_strict(addr))
2707                 dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2708
2709         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2710         if (!rt)
2711                 return ERR_PTR(-ENOMEM);
2712
2713         in6_dev_hold(idev);
2714
2715         rt->dst.flags |= DST_HOST;
2716         rt->dst.input = ip6_input;
2717         rt->dst.output = ip6_output;
2718         rt->rt6i_idev = idev;
2719
2720         rt->rt6i_protocol = RTPROT_KERNEL;
2721         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2722         if (anycast)
2723                 rt->rt6i_flags |= RTF_ANYCAST;
2724         else
2725                 rt->rt6i_flags |= RTF_LOCAL;
2726
2727         rt->rt6i_gateway  = *addr;
2728         rt->rt6i_dst.addr = *addr;
2729         rt->rt6i_dst.plen = 128;
2730         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2731         rt->rt6i_table = fib6_get_table(net, tb_id);
2732
2733         return rt;
2734 }
2735
2736 /* remove a deleted IP address from prefsrc entries */
2737 struct arg_dev_net_ip {
2738         struct net_device *dev;
2739         struct net *net;
2740         struct in6_addr *addr;
2741 };
2742
2743 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2744 {
2745         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2746         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2747         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2748
2749         if (((void *)rt->dst.dev == dev || !dev) &&
2750             rt != net->ipv6.ip6_null_entry &&
2751             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2752                 /* remove prefsrc entry */
2753                 rt->rt6i_prefsrc.plen = 0;
2754         }
2755         return 0;
2756 }
2757
2758 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2759 {
2760         struct net *net = dev_net(ifp->idev->dev);
2761         struct arg_dev_net_ip adni = {
2762                 .dev = ifp->idev->dev,
2763                 .net = net,
2764                 .addr = &ifp->addr,
2765         };
2766         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2767 }
2768
2769 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2770 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2771
2772 /* Remove routers and update dst entries when a gateway turns into a host. */
2773 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2774 {
2775         struct in6_addr *gateway = (struct in6_addr *)arg;
2776
2777         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2778              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2779              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2780                 return -1;
2781         }
2782         return 0;
2783 }
2784
2785 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2786 {
2787         fib6_clean_all(net, fib6_clean_tohost, gateway);
2788 }
2789
2790 struct arg_dev_net {
2791         struct net_device *dev;
2792         struct net *net;
2793 };
2794
2795 /* called with write lock held for table with rt */
2796 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2797 {
2798         const struct arg_dev_net *adn = arg;
2799         const struct net_device *dev = adn->dev;
2800
2801         if ((rt->dst.dev == dev || !dev) &&
2802             rt != adn->net->ipv6.ip6_null_entry &&
2803             (rt->rt6i_nsiblings == 0 ||
2804              (dev && netdev_unregistering(dev)) ||
2805              !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
2806                 return -1;
2807
2808         return 0;
2809 }
2810
2811 void rt6_ifdown(struct net *net, struct net_device *dev)
2812 {
2813         struct arg_dev_net adn = {
2814                 .dev = dev,
2815                 .net = net,
2816         };
2817
2818         fib6_clean_all(net, fib6_ifdown, &adn);
2819         if (dev)
2820                 rt6_uncached_list_flush_dev(net, dev);
2821 }
2822
2823 struct rt6_mtu_change_arg {
2824         struct net_device *dev;
2825         unsigned int mtu;
2826 };
2827
2828 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2829 {
2830         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2831         struct inet6_dev *idev;
2832
2833         /* In IPv6, PMTU discovery is not optional,
2834            so the RTAX_MTU lock cannot disable it.
2835            We still use this lock to block changes
2836            caused by addrconf/ndisc.
2837         */
2838
2839         idev = __in6_dev_get(arg->dev);
2840         if (!idev)
2841                 return 0;
2842
2843         /* For an administrative MTU increase there is no way to discover
2844            an IPv6 PMTU increase, so the PMTU increase should be applied here.
2845            Since RFC 1981 doesn't cover administrative MTU increases,
2846            updating the PMTU on an increase is a MUST (e.g. jumbo frames).
2847          */
2848         /*
2849            If the new MTU is less than the route PMTU, the new MTU will be
2850            the lowest MTU in the path; update the route PMTU to reflect the
2851            decrease. If the new MTU is greater than the route PMTU, and the
2852            old MTU was the lowest MTU in the path, update the route PMTU to
2853            reflect the increase. In that case, if another node on the path
2854            still has the lowest MTU, a Packet Too Big message will trigger
2855            PMTU discovery.
2856          */
2857         if (rt->dst.dev == arg->dev &&
2858             dst_metric_raw(&rt->dst, RTAX_MTU) &&
2859             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2860                 if (rt->rt6i_flags & RTF_CACHE) {
2861                         /* For RTF_CACHE with rt6i_pmtu == 0
2862                          * (i.e. a redirected route),
2863                          * the metrics of its rt->dst.from have already
2864                          * been updated.
2865                          */
2866                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2867                                 rt->rt6i_pmtu = arg->mtu;
2868                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2869                            (dst_mtu(&rt->dst) < arg->mtu &&
2870                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2871                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2872                 }
2873         }
2874         return 0;
2875 }
2876
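/* Propagate a device MTU change to the routing table: walk all FIB
 * entries and let rt6_mtu_change_route() adjust the cached MTU metrics
 * of routes using @dev.
 */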
2877 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2878 {
2879         struct rt6_mtu_change_arg arg = {
2880                 .dev = dev,
2881                 .mtu = mtu,
2882         };
2883
2884         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2885 }
2886
2887 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2888         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2889         [RTA_OIF]               = { .type = NLA_U32 },
2890         [RTA_IIF]               = { .type = NLA_U32 },
2891         [RTA_PRIORITY]          = { .type = NLA_U32 },
2892         [RTA_METRICS]           = { .type = NLA_NESTED },
2893         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2894         [RTA_PREF]              = { .type = NLA_U8 },
2895         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2896         [RTA_ENCAP]             = { .type = NLA_NESTED },
2897         [RTA_EXPIRES]           = { .type = NLA_U32 },
2898         [RTA_UID]               = { .type = NLA_U32 },
2899         [RTA_MARK]              = { .type = NLA_U32 },
2900 };
2901
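/* Parse a netlink route request into a fib6_config, validating the
 * attributes against rtm_ipv6_policy and any lwtunnel encap types.
 */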
2902 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2903                               struct fib6_config *cfg,
2904                               struct netlink_ext_ack *extack)
2905 {
2906         struct rtmsg *rtm;
2907         struct nlattr *tb[RTA_MAX+1];
2908         unsigned int pref;
2909         int err;
2910
2911         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
2912                           NULL);
2913         if (err < 0)
2914                 goto errout;
2915
2916         err = -EINVAL;
2917         rtm = nlmsg_data(nlh);
2918         memset(cfg, 0, sizeof(*cfg));
2919
2920         cfg->fc_table = rtm->rtm_table;
2921         cfg->fc_dst_len = rtm->rtm_dst_len;
2922         cfg->fc_src_len = rtm->rtm_src_len;
2923         cfg->fc_flags = RTF_UP;
2924         cfg->fc_protocol = rtm->rtm_protocol;
2925         cfg->fc_type = rtm->rtm_type;
2926
2927         if (rtm->rtm_type == RTN_UNREACHABLE ||
2928             rtm->rtm_type == RTN_BLACKHOLE ||
2929             rtm->rtm_type == RTN_PROHIBIT ||
2930             rtm->rtm_type == RTN_THROW)
2931                 cfg->fc_flags |= RTF_REJECT;
2932
2933         if (rtm->rtm_type == RTN_LOCAL)
2934                 cfg->fc_flags |= RTF_LOCAL;
2935
2936         if (rtm->rtm_flags & RTM_F_CLONED)
2937                 cfg->fc_flags |= RTF_CACHE;
2938
2939         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2940         cfg->fc_nlinfo.nlh = nlh;
2941         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2942
2943         if (tb[RTA_GATEWAY]) {
2944                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2945                 cfg->fc_flags |= RTF_GATEWAY;
2946         }
2947
2948         if (tb[RTA_DST]) {
2949                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2950
2951                 if (nla_len(tb[RTA_DST]) < plen)
2952                         goto errout;
2953
2954                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2955         }
2956
2957         if (tb[RTA_SRC]) {
2958                 int plen = (rtm->rtm_src_len + 7) >> 3;
2959
2960                 if (nla_len(tb[RTA_SRC]) < plen)
2961                         goto errout;
2962
2963                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2964         }
2965
2966         if (tb[RTA_PREFSRC])
2967                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2968
2969         if (tb[RTA_OIF])
2970                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2971
2972         if (tb[RTA_PRIORITY])
2973                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2974
2975         if (tb[RTA_METRICS]) {
2976                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2977                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2978         }
2979
2980         if (tb[RTA_TABLE])
2981                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2982
2983         if (tb[RTA_MULTIPATH]) {
2984                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2985                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2986
2987                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
2988                                                      cfg->fc_mp_len, extack);
2989                 if (err < 0)
2990                         goto errout;
2991         }
2992
2993         if (tb[RTA_PREF]) {
2994                 pref = nla_get_u8(tb[RTA_PREF]);
2995                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2996                     pref != ICMPV6_ROUTER_PREF_HIGH)
2997                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2998                 cfg->fc_flags |= RTF_PREF(pref);
2999         }
3000
3001         if (tb[RTA_ENCAP])
3002                 cfg->fc_encap = tb[RTA_ENCAP];
3003
3004         if (tb[RTA_ENCAP_TYPE]) {
3005                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3006
3007                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3008                 if (err < 0)
3009                         goto errout;
3010         }
3011
3012         if (tb[RTA_EXPIRES]) {
3013                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3014
3015                 if (addrconf_finite_timeout(timeout)) {
3016                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3017                         cfg->fc_flags |= RTF_EXPIRES;
3018                 }
3019         }
3020
3021         err = 0;
3022 errout:
3023         return err;
3024 }
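
/* For illustration: an iproute2 request along the lines of
 *
 *	ip -6 route add 2001:db8::/64 via fe80::1 dev eth0 metric 1024
 *
 * typically reaches the parser above as RTM_NEWROUTE with rtm_dst_len = 64
 * plus RTA_DST, RTA_GATEWAY, RTA_OIF and RTA_PRIORITY attributes, which
 * land in fc_dst/fc_dst_len, fc_gateway (with RTF_GATEWAY set), fc_ifindex
 * and fc_metric.  The prefix and device name are placeholders only.
 */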
3025
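/* Per-nexthop bookkeeping for a multipath request: the rt6_info built for
 * this nexthop, the per-nexthop config it was derived from, its converted
 * metrics, and the link into the temporary rt6_nh_list used below.
 */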
3026 struct rt6_nh {
3027         struct rt6_info *rt6_info;
3028         struct fib6_config r_cfg;
3029         struct mx6_config mxc;
3030         struct list_head next;
3031 };
3032
3033 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3034 {
3035         struct rt6_nh *nh;
3036
3037         list_for_each_entry(nh, rt6_nh_list, next) {
3038                 pr_warn("multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3039                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3040                         nh->r_cfg.fc_ifindex);
3041         }
3042 }
3043
3044 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3045                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3046 {
3047         struct rt6_nh *nh;
3048         int err = -EEXIST;
3049
3050         list_for_each_entry(nh, rt6_nh_list, next) {
3051                 /* check if rt6_info already exists */
3052                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3053                         return err;
3054         }
3055
3056         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3057         if (!nh)
3058                 return -ENOMEM;
3059         nh->rt6_info = rt;
3060         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3061         if (err) {
3062                 kfree(nh);
3063                 return err;
3064         }
3065         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3066         list_add_tail(&nh->next, rt6_nh_list);
3067
3068         return 0;
3069 }
3070
3071 static void ip6_route_mpath_notify(struct rt6_info *rt,
3072                                    struct rt6_info *rt_last,
3073                                    struct nl_info *info,
3074                                    __u16 nlflags)
3075 {
3076         /* if this is an APPEND route, then rt points to the first route
3077          * inserted and rt_last points to the last route inserted. Userspace
3078          * wants a consistent dump of the route which starts at the first
3079          * nexthop. Since sibling routes are always added at the end of
3080          * the list, find the first sibling of the last route appended
3081          */
3082         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3083                 rt = list_first_entry(&rt_last->rt6i_siblings,
3084                                       struct rt6_info,
3085                                       rt6i_siblings);
3086         }
3087
3088         if (rt)
3089                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3090 }
3091
3092 static int ip6_route_multipath_add(struct fib6_config *cfg,
3093                                    struct netlink_ext_ack *extack)
3094 {
3095         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3096         struct nl_info *info = &cfg->fc_nlinfo;
3097         struct fib6_config r_cfg;
3098         struct rtnexthop *rtnh;
3099         struct rt6_info *rt;
3100         struct rt6_nh *err_nh;
3101         struct rt6_nh *nh, *nh_safe;
3102         __u16 nlflags;
3103         int remaining;
3104         int attrlen;
3105         int err = 1;
3106         int nhn = 0;
3107         int replace = (cfg->fc_nlinfo.nlh &&
3108                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3109         LIST_HEAD(rt6_nh_list);
3110
3111         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3112         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3113                 nlflags |= NLM_F_APPEND;
3114
3115         remaining = cfg->fc_mp_len;
3116         rtnh = (struct rtnexthop *)cfg->fc_mp;
3117
3118         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3119          * rt6_info structs per nexthop
3120          */
3121         while (rtnh_ok(rtnh, remaining)) {
3122                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3123                 if (rtnh->rtnh_ifindex)
3124                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3125
3126                 attrlen = rtnh_attrlen(rtnh);
3127                 if (attrlen > 0) {
3128                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3129
3130                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3131                         if (nla) {
3132                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3133                                 r_cfg.fc_flags |= RTF_GATEWAY;
3134                         }
3135                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3136                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3137                         if (nla)
3138                                 r_cfg.fc_encap_type = nla_get_u16(nla);
3139                 }
3140
3141                 rt = ip6_route_info_create(&r_cfg, extack);
3142                 if (IS_ERR(rt)) {
3143                         err = PTR_ERR(rt);
3144                         rt = NULL;
3145                         goto cleanup;
3146                 }
3147
3148                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3149                 if (err) {
3150                         dst_release_immediate(&rt->dst);
3151                         goto cleanup;
3152                 }
3153
3154                 rtnh = rtnh_next(rtnh, &remaining);
3155         }
3156
3157         /* For add and replace, send one notification with all nexthops.
3158          * Skip the notification in fib6_add_rt2node and send one with
3159          * the full route when done.
3160          */
3161         info->skip_notify = 1;
3162
3163         err_nh = NULL;
3164         list_for_each_entry(nh, &rt6_nh_list, next) {
3165                 rt_last = nh->rt6_info;
3166                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3167                 /* save reference to first route for notification */
3168                 if (!rt_notif && !err)
3169                         rt_notif = nh->rt6_info;
3170
3171                 /* nh->rt6_info is used or freed at this point, reset to NULL */
3172                 nh->rt6_info = NULL;
3173                 if (err) {
3174                         if (replace && nhn)
3175                                 ip6_print_replace_route_err(&rt6_nh_list);
3176                         err_nh = nh;
3177                         goto add_errout;
3178                 }
3179
3180                 /* Because each route is added like a single route, we remove
3181                  * these flags after the first nexthop: if there is a collision,
3182                  * we have already failed to add the first nexthop, since
3183                  * fib6_add_rt2node() has rejected it; when replacing, the old
3184                  * nexthops have been replaced by the first new one, and the
3185                  * remaining ones should be appended to it.
3186                  */
3187                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3188                                                      NLM_F_REPLACE);
3189                 nhn++;
3190         }
3191
3192         /* success ... tell user about new route */
3193         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3194         goto cleanup;
3195
3196 add_errout:
3197         /* send notification for routes that were added so that
3198          * the delete notifications sent by ip6_route_del are
3199          * coherent
3200          */
3201         if (rt_notif)
3202                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3203
3204         /* Delete routes that were already added */
3205         list_for_each_entry(nh, &rt6_nh_list, next) {
3206                 if (err_nh == nh)
3207                         break;
3208                 ip6_route_del(&nh->r_cfg, extack);
3209         }
3210
3211 cleanup:
3212         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3213                 if (nh->rt6_info)
3214                         dst_release_immediate(&nh->rt6_info->dst);
3215                 kfree(nh->mxc.mx);
3216                 list_del(&nh->next);
3217                 kfree(nh);
3218         }
3219
3220         return err;
3221 }
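
/* For illustration, the add/replace path above is what a multipath
 * command such as
 *
 *	ip -6 route replace 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 \
 *		nexthop via fe80::2 dev eth1
 *
 * exercises: each "nexthop" arrives as one rtnexthop entry inside
 * RTA_MULTIPATH, one rt6_info is created per entry, and a single
 * RTM_NEWROUTE notification covering all siblings is sent at the end.
 * Addresses and device names are placeholders.
 */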
3222
3223 static int ip6_route_multipath_del(struct fib6_config *cfg,
3224                                    struct netlink_ext_ack *extack)
3225 {
3226         struct fib6_config r_cfg;
3227         struct rtnexthop *rtnh;
3228         int remaining;
3229         int attrlen;
3230         int err = 1, last_err = 0;
3231
3232         remaining = cfg->fc_mp_len;
3233         rtnh = (struct rtnexthop *)cfg->fc_mp;
3234
3235         /* Parse a Multipath Entry */
3236         while (rtnh_ok(rtnh, remaining)) {
3237                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3238                 if (rtnh->rtnh_ifindex)
3239                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3240
3241                 attrlen = rtnh_attrlen(rtnh);
3242                 if (attrlen > 0) {
3243                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3244
3245                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3246                         if (nla) {
3247                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3248                                 r_cfg.fc_flags |= RTF_GATEWAY;
3249                         }
3250                 }
3251                 err = ip6_route_del(&r_cfg, extack);
3252                 if (err)
3253                         last_err = err;
3254
3255                 rtnh = rtnh_next(rtnh, &remaining);
3256         }
3257
3258         return last_err;
3259 }
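
/* Deleting a multipath route, e.g.
 *
 *	ip -6 route del 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 \
 *		nexthop via fe80::2 dev eth1
 *
 * walks the same rtnexthop entries and calls ip6_route_del() once per
 * nexthop; the last non-zero error, if any, is what gets reported back.
 * Addresses and device names are again placeholders.
 */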
3260
3261 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3262                               struct netlink_ext_ack *extack)
3263 {
3264         struct fib6_config cfg;
3265         int err;
3266
3267         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3268         if (err < 0)
3269                 return err;
3270
3271         if (cfg.fc_mp)
3272                 return ip6_route_multipath_del(&cfg, extack);
3273         else {
3274                 cfg.fc_delete_all_nh = 1;
3275                 return ip6_route_del(&cfg, extack);
3276         }
3277 }
3278
3279 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3280                               struct netlink_ext_ack *extack)
3281 {
3282         struct fib6_config cfg;
3283         int err;
3284
3285         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3286         if (err < 0)
3287                 return err;
3288
3289         if (cfg.fc_mp)
3290                 return ip6_route_multipath_add(&cfg, extack);
3291         else
3292                 return ip6_route_add(&cfg, extack);
3293 }
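
/* A minimal, hypothetical sketch (guarded out, never compiled) of how an
 * in-kernel caller might add an IPv6 blackhole route by filling a
 * fib6_config directly instead of going through rtm_to_fib6_config().
 * The function name, field choices and the 2001:db8::/32 documentation
 * prefix are illustrative assumptions, not part of this file's API.
 */
#if 0
static int ip6_example_add_blackhole(struct net *net)
{
        struct fib6_config cfg = {
                .fc_table       = RT6_TABLE_MAIN,
                .fc_metric      = IP6_RT_PRIO_USER,
                .fc_dst_len     = 32,
                .fc_flags       = RTF_UP | RTF_REJECT,
                .fc_type        = RTN_BLACKHOLE,
                .fc_protocol    = RTPROT_STATIC,
                .fc_nlinfo      = { .nl_net = net },
        };

        /* 2001:db8::/32, the RFC 3849 documentation prefix */
        cfg.fc_dst.s6_addr32[0] = htonl(0x20010db8);

        return ip6_route_add(&cfg, NULL);
}
#endif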
3294
3295 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3296 {
3297         int nexthop_len = 0;
3298
3299         if (rt->rt6i_nsiblings) {
3300                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
3301                             + NLA_ALIGN(sizeof(struct rtnexthop))
3302                             + nla_total_size(16) /* RTA_GATEWAY */
3303                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
3304
3305                 nexthop_len *= rt->rt6i_nsiblings;
3306         }
3307
3308         return NLMSG_ALIGN(sizeof(struct rtmsg))
3309                + nla_total_size(16) /* RTA_SRC */
3310                + nla_total_size(16) /* RTA_DST */
3311                + nla_total_size(16) /* RTA_GATEWAY */
3312                + nla_total_size(16) /* RTA_PREFSRC */
3313                + nla_total_size(4) /* RTA_TABLE */
3314                + nla_total_size(4) /* RTA_IIF */
3315                + nla_total_size(4) /* RTA_OIF */
3316                + nla_total_size(4) /* RTA_PRIORITY */
3317                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3318                + nla_total_size(sizeof(struct rta_cacheinfo))
3319                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3320                + nla_total_size(1) /* RTA_PREF */
3321                + lwtunnel_get_encap_size(rt->dst.lwtstate)
3322                + nexthop_len;
3323 }
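
/* Rough sanity check of the estimate above: a route with three siblings
 * reserves the fixed per-route attribute space once plus three copies of
 * the per-nexthop estimate for the RTA_MULTIPATH payload.  The value is
 * meant to be an upper bound; inet6_rt_notify() warns if rt6_fill_node()
 * still overflows it (-EMSGSIZE).
 */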
3324
3325 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3326                             unsigned int *flags, bool skip_oif)
3327 {
3328         if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3329                 *flags |= RTNH_F_LINKDOWN;
3330                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3331                         *flags |= RTNH_F_DEAD;
3332         }
3333
3334         if (rt->rt6i_flags & RTF_GATEWAY) {
3335                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3336                         goto nla_put_failure;
3337         }
3338
3339         /* not needed for multipath encoding because it has an rtnexthop struct */
3340         if (!skip_oif && rt->dst.dev &&
3341             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3342                 goto nla_put_failure;
3343
3344         if (rt->dst.lwtstate &&
3345             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3346                 goto nla_put_failure;
3347
3348         return 0;
3349
3350 nla_put_failure:
3351         return -EMSGSIZE;
3352 }
3353
3354 /* add multipath next hop */
3355 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3356 {
3357         struct rtnexthop *rtnh;
3358         unsigned int flags = 0;
3359
3360         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3361         if (!rtnh)
3362                 goto nla_put_failure;
3363
3364         rtnh->rtnh_hops = 0;
3365         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3366
3367         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3368                 goto nla_put_failure;
3369
3370         rtnh->rtnh_flags = flags;
3371
3372         /* length of rtnetlink header + attributes */
3373         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
3374
3375         return 0;
3376
3377 nla_put_failure:
3378         return -EMSGSIZE;
3379 }
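
/* The resulting RTA_MULTIPATH payload is a sequence of variable-length
 * entries, each laid out roughly as:
 *
 *	struct rtnexthop   (rtnh_len, rtnh_flags, rtnh_hops, rtnh_ifindex)
 *	RTA_GATEWAY        (16-byte address, only if RTF_GATEWAY is set)
 *	RTA_ENCAP{,_TYPE}  (only if an lwtunnel state is attached)
 *
 * rtnh_len covers the header plus its trailing attributes, which is how
 * rtnh_next() steps from one entry to the next on the parsing side.
 */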
3380
3381 static int rt6_fill_node(struct net *net,
3382                          struct sk_buff *skb, struct rt6_info *rt,
3383                          struct in6_addr *dst, struct in6_addr *src,
3384                          int iif, int type, u32 portid, u32 seq,
3385                          unsigned int flags)
3386 {
3387         u32 metrics[RTAX_MAX];
3388         struct rtmsg *rtm;
3389         struct nlmsghdr *nlh;
3390         long expires;
3391         u32 table;
3392
3393         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3394         if (!nlh)
3395                 return -EMSGSIZE;
3396
3397         rtm = nlmsg_data(nlh);
3398         rtm->rtm_family = AF_INET6;
3399         rtm->rtm_dst_len = rt->rt6i_dst.plen;
3400         rtm->rtm_src_len = rt->rt6i_src.plen;
3401         rtm->rtm_tos = 0;
3402         if (rt->rt6i_table)
3403                 table = rt->rt6i_table->tb6_id;
3404         else
3405                 table = RT6_TABLE_UNSPEC;
3406         rtm->rtm_table = table;
3407         if (nla_put_u32(skb, RTA_TABLE, table))
3408                 goto nla_put_failure;
3409         if (rt->rt6i_flags & RTF_REJECT) {
3410                 switch (rt->dst.error) {
3411                 case -EINVAL:
3412                         rtm->rtm_type = RTN_BLACKHOLE;
3413                         break;
3414                 case -EACCES:
3415                         rtm->rtm_type = RTN_PROHIBIT;
3416                         break;
3417                 case -EAGAIN:
3418                         rtm->rtm_type = RTN_THROW;
3419                         break;
3420                 default:
3421                         rtm->rtm_type = RTN_UNREACHABLE;
3422                         break;
3423                 }
3424         }
3425         } else if (rt->rt6i_flags & RTF_LOCAL)
3427         else if (rt->rt6i_flags & RTF_ANYCAST)
3428                 rtm->rtm_type = RTN_ANYCAST;
3429         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3430                 rtm->rtm_type = RTN_LOCAL;
3431         else
3432                 rtm->rtm_type = RTN_UNICAST;
3433         rtm->rtm_flags = 0;
3434         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3435         rtm->rtm_protocol = rt->rt6i_protocol;
3436
3437         if (rt->rt6i_flags & RTF_CACHE)
3438                 rtm->rtm_flags |= RTM_F_CLONED;
3439
3440         if (dst) {
3441                 if (nla_put_in6_addr(skb, RTA_DST, dst))
3442                         goto nla_put_failure;
3443                 rtm->rtm_dst_len = 128;
3444         } else if (rtm->rtm_dst_len)
3445                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3446                         goto nla_put_failure;
3447 #ifdef CONFIG_IPV6_SUBTREES
3448         if (src) {
3449                 if (nla_put_in6_addr(skb, RTA_SRC, src))
3450                         goto nla_put_failure;
3451                 rtm->rtm_src_len = 128;
3452         } else if (rtm->rtm_src_len &&
3453                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3454                 goto nla_put_failure;
3455 #endif
3456         if (iif) {
3457 #ifdef CONFIG_IPV6_MROUTE
3458                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3459                         int err = ip6mr_get_route(net, skb, rtm, portid);
3460
3461                         if (err == 0)
3462                                 return 0;
3463                         if (err < 0)
3464                                 goto nla_put_failure;
3465                 } else
3466 #endif
3467                         if (nla_put_u32(skb, RTA_IIF, iif))
3468                                 goto nla_put_failure;
3469         } else if (dst) {
3470                 struct in6_addr saddr_buf;
3471                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3472                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3473                         goto nla_put_failure;
3474         }
3475
3476         if (rt->rt6i_prefsrc.plen) {
3477                 struct in6_addr saddr_buf;
3478                 saddr_buf = rt->rt6i_prefsrc.addr;
3479                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3480                         goto nla_put_failure;
3481         }
3482
3483         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3484         if (rt->rt6i_pmtu)
3485                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3486         if (rtnetlink_put_metrics(skb, metrics) < 0)
3487                 goto nla_put_failure;
3488
3489         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3490                 goto nla_put_failure;
3491
3492         /* For multipath routes, walk the siblings list and add
3493          * each as a nexthop within RTA_MULTIPATH.
3494          */
3495         if (rt->rt6i_nsiblings) {
3496                 struct rt6_info *sibling, *next_sibling;
3497                 struct nlattr *mp;
3498
3499                 mp = nla_nest_start(skb, RTA_MULTIPATH);
3500                 if (!mp)
3501                         goto nla_put_failure;
3502
3503                 if (rt6_add_nexthop(skb, rt) < 0)
3504                         goto nla_put_failure;
3505
3506                 list_for_each_entry_safe(sibling, next_sibling,
3507                                          &rt->rt6i_siblings, rt6i_siblings) {
3508                         if (rt6_add_nexthop(skb, sibling) < 0)
3509                                 goto nla_put_failure;
3510                 }
3511
3512                 nla_nest_end(skb, mp);
3513         } else {
3514                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
3515                         goto nla_put_failure;
3516         }
3517
3518         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3519
3520         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3521                 goto nla_put_failure;
3522
3523         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3524                 goto nla_put_failure;
3525
3527         nlmsg_end(skb, nlh);
3528         return 0;
3529
3530 nla_put_failure:
3531         nlmsg_cancel(skb, nlh);
3532         return -EMSGSIZE;
3533 }
3534
3535 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3536 {
3537         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3538         struct net *net = arg->net;
3539
3540         if (rt == net->ipv6.ip6_null_entry)
3541                 return 0;
3542
3543         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3544                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3545
3546                 /* user wants prefix routes only */
3547                 if (rtm->rtm_flags & RTM_F_PREFIX &&
3548                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3549                         /* success since this is not a prefix route */
3550                         return 1;
3551                 }
3552         }
3553
3554         return rt6_fill_node(net,
3555                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3556                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3557                      NLM_F_MULTI);
3558 }
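
/* This is the per-route callback used when dumping the FIB, e.g. for a
 * plain "ip -6 route show": it runs once per rt6_info, skips the null
 * entry, and honours RTM_F_PREFIX by not serializing routes that lack
 * RTF_PREFIX_RT.
 */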
3559
3560 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3561                               struct netlink_ext_ack *extack)
3562 {
3563         struct net *net = sock_net(in_skb->sk);
3564         struct nlattr *tb[RTA_MAX+1];
3565         int err, iif = 0, oif = 0;
3566         struct dst_entry *dst;
3567         struct rt6_info *rt;
3568         struct sk_buff *skb;
3569         struct rtmsg *rtm;
3570         struct flowi6 fl6;
3571         bool fibmatch;
3572
3573         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3574                           extack);
3575         if (err < 0)
3576                 goto errout;
3577
3578         err = -EINVAL;
3579         memset(&fl6, 0, sizeof(fl6));
3580         rtm = nlmsg_data(nlh);
3581         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3582         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
3583
3584         if (tb[RTA_SRC]) {
3585                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3586                         goto errout;
3587
3588                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3589         }
3590
3591         if (tb[RTA_DST]) {
3592                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3593                         goto errout;
3594
3595                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3596         }
3597
3598         if (tb[RTA_IIF])
3599                 iif = nla_get_u32(tb[RTA_IIF]);
3600
3601         if (tb[RTA_OIF])
3602                 oif = nla_get_u32(tb[RTA_OIF]);
3603
3604         if (tb[RTA_MARK])
3605                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3606
3607         if (tb[RTA_UID])
3608                 fl6.flowi6_uid = make_kuid(current_user_ns(),
3609                                            nla_get_u32(tb[RTA_UID]));
3610         else
3611                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
3612
3613         if (iif) {
3614                 struct net_device *dev;
3615                 int flags = 0;
3616
3617                 dev = __dev_get_by_index(net, iif);
3618                 if (!dev) {
3619                         err = -ENODEV;
3620                         goto errout;
3621                 }
3622
3623                 fl6.flowi6_iif = iif;
3624
3625                 if (!ipv6_addr_any(&fl6.saddr))
3626                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3627
3628                 if (!fibmatch)
3629                         dst = ip6_route_input_lookup(net, dev, &fl6, flags);
3630         } else {
3631                 fl6.flowi6_oif = oif;
3632
3633                 if (!fibmatch)
3634                         dst = ip6_route_output(net, NULL, &fl6);
3635         }
3636
3637         if (fibmatch)
3638                 dst = ip6_route_lookup(net, &fl6, 0);
3639
3640         rt = container_of(dst, struct rt6_info, dst);
3641         if (rt->dst.error) {
3642                 err = rt->dst.error;
3643                 ip6_rt_put(rt);
3644                 goto errout;
3645         }
3646
3647         if (rt == net->ipv6.ip6_null_entry) {
3648                 err = rt->dst.error;
3649                 ip6_rt_put(rt);
3650                 goto errout;
3651         }
3652
3653         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3654         if (!skb) {
3655                 ip6_rt_put(rt);
3656                 err = -ENOBUFS;
3657                 goto errout;
3658         }
3659
3660         skb_dst_set(skb, &rt->dst);
3661         if (fibmatch)
3662                 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
3663                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3664                                     nlh->nlmsg_seq, 0);
3665         else
3666                 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3667                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3668                                     nlh->nlmsg_seq, 0);
3669         if (err < 0) {
3670                 kfree_skb(skb);
3671                 goto errout;
3672         }
3673
3674         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3675 errout:
3676         return err;
3677 }
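
/* Illustrative userspace counterparts for the handler above (the address
 * is the documentation prefix, purely a placeholder):
 *
 *	ip -6 route get 2001:db8::1
 *		- resolves and reports the route the stack would use
 *	ip -6 route get 2001:db8::1 fibmatch
 *		- sets RTM_F_FIB_MATCH to report the matching FIB entry
 *		  instead of the resolved result
 */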
3678
3679 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3680                      unsigned int nlm_flags)
3681 {
3682         struct sk_buff *skb;
3683         struct net *net = info->nl_net;
3684         u32 seq;
3685         int err;
3686
3687         err = -ENOBUFS;
3688         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3689
3690         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3691         if (!skb)
3692                 goto errout;
3693
3694         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3695                                 event, info->portid, seq, nlm_flags);
3696         if (err < 0) {
3697                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3698                 WARN_ON(err == -EMSGSIZE);
3699                 kfree_skb(skb);
3700                 goto errout;
3701         }
3702         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3703                     info->nlh, gfp_any());
3704         return;
3705 errout:
3706         if (err < 0)
3707                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3708 }
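
/* These notifications are multicast to RTNLGRP_IPV6_ROUTE, so anything
 * subscribed to that group (for example "ip -6 monitor route" or a
 * routing daemon's netlink socket) sees one message per route change,
 * sized up front by rt6_nlmsg_size().
 */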
3709
3710 static int ip6_route_dev_notify(struct notifier_block *this,
3711                                 unsigned long event, void *ptr)
3712 {
3713         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3714         struct net *net = dev_net(dev);
3715
3716         if (!(dev->flags & IFF_LOOPBACK))
3717                 return NOTIFY_OK;
3718
3719         if (event == NETDEV_REGISTER) {
3720                 net->ipv6.ip6_null_entry->dst.dev = dev;
3721                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3722 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3723                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3724                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3725                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3726                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3727 #endif
3728          } else if (event == NETDEV_UNREGISTER &&
3729                     dev->reg_state != NETREG_UNREGISTERED) {
3730                 /* NETDEV_UNREGISTER can be fired multiple times by
3731                  * netdev_wait_allrefs(). Make sure we only call this once.
3732                  */
3733                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
3734 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3735                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
3736                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
3737 #endif
3738         }
3739
3740         return NOTIFY_OK;
3741 }
3742
3743 /*
3744  *      /proc
3745  */
3746
3747 #ifdef CONFIG_PROC_FS
3748
3749 static const struct file_operations ipv6_route_proc_fops = {
3750         .owner          = THIS_MODULE,
3751         .open           = ipv6_route_open,
3752         .read           = seq_read,
3753         .llseek         = seq_lseek,
3754         .release        = seq_release_net,
3755 };
3756
3757 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3758 {
3759         struct net *net = (struct net *)seq->private;
3760         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3761                    net->ipv6.rt6_stats->fib_nodes,
3762                    net->ipv6.rt6_stats->fib_route_nodes,
3763                    net->ipv6.rt6_stats->fib_rt_alloc,
3764                    net->ipv6.rt6_stats->fib_rt_entries,
3765                    net->ipv6.rt6_stats->fib_rt_cache,
3766                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3767                    net->ipv6.rt6_stats->fib_discarded_routes);
3768
3769         return 0;
3770 }
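
/* "cat /proc/net/rt6_stats" prints these seven counters as %04x fields in
 * the order shown: fib_nodes, fib_route_nodes, fib_rt_alloc,
 * fib_rt_entries, fib_rt_cache, the current dst entry count and
 * fib_discarded_routes.
 */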
3771
3772 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3773 {
3774         return single_open_net(inode, file, rt6_stats_seq_show);
3775 }
3776
3777 static const struct file_operations rt6_stats_seq_fops = {
3778         .owner   = THIS_MODULE,
3779         .open    = rt6_stats_seq_open,
3780         .read    = seq_read,
3781         .llseek  = seq_lseek,
3782         .release = single_release_net,
3783 };
3784 #endif  /* CONFIG_PROC_FS */
3785
3786 #ifdef CONFIG_SYSCTL
3787
3788 static
3789 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3790                               void __user *buffer, size_t *lenp, loff_t *ppos)
3791 {
3792         struct net *net;
3793         int delay;
3794         if (!write)
3795                 return -EINVAL;
3796
3797         net = (struct net *)ctl->extra1;
3798         delay = net->ipv6.sysctl.flush_delay;
3799         proc_dointvec(ctl, write, buffer, lenp, ppos);
3800         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3801         return 0;
3802 }
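
/* This handler backs the write-only /proc/sys/net/ipv6/route/flush knob;
 * writing to it, e.g.
 *
 *	echo 1 > /proc/sys/net/ipv6/route/flush
 *
 * kicks off a fib6 garbage-collection pass for this namespace.
 */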
3803
3804 struct ctl_table ipv6_route_table_template[] = {
3805         {
3806                 .procname       =       "flush",
3807                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3808                 .maxlen         =       sizeof(int),
3809                 .mode           =       0200,
3810                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3811         },
3812         {
3813                 .procname       =       "gc_thresh",
3814                 .data           =       &ip6_dst_ops_template.gc_thresh,
3815                 .maxlen         =       sizeof(int),
3816                 .mode           =       0644,
3817                 .proc_handler   =       proc_dointvec,
3818         },
3819         {
3820                 .procname       =       "max_size",
3821                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3822                 .maxlen         =       sizeof(int),
3823                 .mode           =       0644,
3824                 .proc_handler   =       proc_dointvec,
3825         },
3826         {
3827                 .procname       =       "gc_min_interval",
3828                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3829                 .maxlen         =       sizeof(int),
3830                 .mode           =       0644,
3831                 .proc_handler   =       proc_dointvec_jiffies,
3832         },
3833         {
3834                 .procname       =       "gc_timeout",
3835                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3836                 .maxlen         =       sizeof(int),
3837                 .mode           =       0644,
3838                 .proc_handler   =       proc_dointvec_jiffies,
3839         },
3840         {
3841                 .procname       =       "gc_interval",
3842                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3843                 .maxlen         =       sizeof(int),
3844                 .mode           =       0644,
3845                 .proc_handler   =       proc_dointvec_jiffies,
3846         },
3847         {
3848                 .procname       =       "gc_elasticity",
3849                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3850                 .maxlen         =       sizeof(int),
3851                 .mode           =       0644,
3852                 .proc_handler   =       proc_dointvec,
3853         },
3854         {
3855                 .procname       =       "mtu_expires",
3856                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3857                 .maxlen         =       sizeof(int),
3858                 .mode           =       0644,
3859                 .proc_handler   =       proc_dointvec_jiffies,
3860         },
3861         {
3862                 .procname       =       "min_adv_mss",
3863                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3864                 .maxlen         =       sizeof(int),
3865                 .mode           =       0644,
3866                 .proc_handler   =       proc_dointvec,
3867         },
3868         {
3869                 .procname       =       "gc_min_interval_ms",
3870                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3871                 .maxlen         =       sizeof(int),
3872                 .mode           =       0644,
3873                 .proc_handler   =       proc_dointvec_ms_jiffies,
3874         },
3875         { }
3876 };
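
/* Each entry above surfaces under /proc/sys/net/ipv6/route/ in every
 * network namespace, so tuning looks like, for example,
 *
 *	sysctl -w net.ipv6.route.gc_thresh=1024
 *	sysctl -w net.ipv6.route.gc_min_interval_ms=500
 *
 * (values are illustrative only); the jiffies-based entries are converted
 * by their proc_dointvec_jiffies / proc_dointvec_ms_jiffies handlers.
 */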
3877
3878 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3879 {
3880         struct ctl_table *table;
3881
3882         table = kmemdup(ipv6_route_table_template,
3883                         sizeof(ipv6_route_table_template),
3884                         GFP_KERNEL);
3885
3886         if (table) {
3887                 table[0].data = &net->ipv6.sysctl.flush_delay;
3888                 table[0].extra1 = net;
3889                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3890                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3891                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3892                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3893                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3894                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3895                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3896                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3897                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3898
3899                 /* Don't export sysctls to unprivileged users */
3900                 if (net->user_ns != &init_user_ns)
3901                         table[0].procname = NULL;
3902         }
3903
3904         return table;
3905 }
3906 #endif
3907
3908 static int __net_init ip6_route_net_init(struct net *net)
3909 {
3910         int ret = -ENOMEM;
3911
3912         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3913                sizeof(net->ipv6.ip6_dst_ops));
3914
3915         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3916                 goto out_ip6_dst_ops;
3917
3918         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3919                                            sizeof(*net->ipv6.ip6_null_entry),
3920                                            GFP_KERNEL);
3921         if (!net->ipv6.ip6_null_entry)
3922                 goto out_ip6_dst_entries;
3923         net->ipv6.ip6_null_entry->dst.path =
3924                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3925         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3926         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3927                          ip6_template_metrics, true);
3928
3929 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3930         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3931                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3932                                                GFP_KERNEL);
3933         if (!net->ipv6.ip6_prohibit_entry)
3934                 goto out_ip6_null_entry;
3935         net->ipv6.ip6_prohibit_entry->dst.path =
3936                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3937         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3938         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3939                          ip6_template_metrics, true);
3940
3941         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3942                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3943                                                GFP_KERNEL);
3944         if (!net->ipv6.ip6_blk_hole_entry)
3945                 goto out_ip6_prohibit_entry;
3946         net->ipv6.ip6_blk_hole_entry->dst.path =
3947                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3948         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3949         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3950                          ip6_template_metrics, true);
3951 #endif
3952
3953         net->ipv6.sysctl.flush_delay = 0;
3954         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3955         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3956         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3957         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3958         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3959         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3960         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3961
3962         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3963
3964         ret = 0;
3965 out:
3966         return ret;
3967
3968 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3969 out_ip6_prohibit_entry:
3970         kfree(net->ipv6.ip6_prohibit_entry);
3971 out_ip6_null_entry:
3972         kfree(net->ipv6.ip6_null_entry);
3973 #endif
3974 out_ip6_dst_entries:
3975         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3976 out_ip6_dst_ops:
3977         goto out;
3978 }
3979
3980 static void __net_exit ip6_route_net_exit(struct net *net)
3981 {
3982         kfree(net->ipv6.ip6_null_entry);
3983 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3984         kfree(net->ipv6.ip6_prohibit_entry);
3985         kfree(net->ipv6.ip6_blk_hole_entry);
3986 #endif
3987         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3988 }
3989
3990 static int __net_init ip6_route_net_init_late(struct net *net)
3991 {
3992 #ifdef CONFIG_PROC_FS
3993         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3994         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3995 #endif
3996         return 0;
3997 }
3998
3999 static void __net_exit ip6_route_net_exit_late(struct net *net)
4000 {
4001 #ifdef CONFIG_PROC_FS
4002         remove_proc_entry("ipv6_route", net->proc_net);
4003         remove_proc_entry("rt6_stats", net->proc_net);
4004 #endif
4005 }
4006
4007 static struct pernet_operations ip6_route_net_ops = {
4008         .init = ip6_route_net_init,
4009         .exit = ip6_route_net_exit,
4010 };
4011
4012 static int __net_init ipv6_inetpeer_init(struct net *net)
4013 {
4014         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4015
4016         if (!bp)
4017                 return -ENOMEM;
4018         inet_peer_base_init(bp);
4019         net->ipv6.peers = bp;
4020         return 0;
4021 }
4022
4023 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4024 {
4025         struct inet_peer_base *bp = net->ipv6.peers;
4026
4027         net->ipv6.peers = NULL;
4028         inetpeer_invalidate_tree(bp);
4029         kfree(bp);
4030 }
4031
4032 static struct pernet_operations ipv6_inetpeer_ops = {
4033         .init   =       ipv6_inetpeer_init,
4034         .exit   =       ipv6_inetpeer_exit,
4035 };
4036
4037 static struct pernet_operations ip6_route_net_late_ops = {
4038         .init = ip6_route_net_init_late,
4039         .exit = ip6_route_net_exit_late,
4040 };
4041
4042 static struct notifier_block ip6_route_dev_notifier = {
4043         .notifier_call = ip6_route_dev_notify,
4044         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4045 };
4046
4047 void __init ip6_route_init_special_entries(void)
4048 {
4049         /* Registering of the loopback is done before this portion of code,
4050          * so the loopback reference in rt6_info will not be taken; do it
4051          * manually for init_net. */
4052         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4053         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4054 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4055         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4056         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4057         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4058         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4059 #endif
4060 }
4061
4062 int __init ip6_route_init(void)
4063 {
4064         int ret;
4065         int cpu;
4066
4067         ret = -ENOMEM;
4068         ip6_dst_ops_template.kmem_cachep =
4069                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4070                                   SLAB_HWCACHE_ALIGN, NULL);
4071         if (!ip6_dst_ops_template.kmem_cachep)
4072                 goto out;
4073
4074         ret = dst_entries_init(&ip6_dst_blackhole_ops);
4075         if (ret)
4076                 goto out_kmem_cache;
4077
4078         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4079         if (ret)
4080                 goto out_dst_entries;
4081
4082         ret = register_pernet_subsys(&ip6_route_net_ops);
4083         if (ret)
4084                 goto out_register_inetpeer;
4085
4086         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4087
4088         ret = fib6_init();
4089         if (ret)
4090                 goto out_register_subsys;
4091
4092         ret = xfrm6_init();
4093         if (ret)
4094                 goto out_fib6_init;
4095
4096         ret = fib6_rules_init();
4097         if (ret)
4098                 goto xfrm6_init;
4099
4100         ret = register_pernet_subsys(&ip6_route_net_late_ops);
4101         if (ret)
4102                 goto fib6_rules_init;
4103
4104         ret = -ENOBUFS;
4105         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
4106             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
4107             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
4108                 goto out_register_late_subsys;
4109
4110         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4111         if (ret)
4112                 goto out_register_late_subsys;
4113
4114         for_each_possible_cpu(cpu) {
4115                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4116
4117                 INIT_LIST_HEAD(&ul->head);
4118                 spin_lock_init(&ul->lock);
4119         }
4120
4121 out:
4122         return ret;
4123
4124 out_register_late_subsys:
4125         unregister_pernet_subsys(&ip6_route_net_late_ops);
4126 fib6_rules_init:
4127         fib6_rules_cleanup();
4128 xfrm6_init:
4129         xfrm6_fini();
4130 out_fib6_init:
4131         fib6_gc_cleanup();
4132 out_register_subsys:
4133         unregister_pernet_subsys(&ip6_route_net_ops);
4134 out_register_inetpeer:
4135         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4136 out_dst_entries:
4137         dst_entries_destroy(&ip6_dst_blackhole_ops);
4138 out_kmem_cache:
4139         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4140         goto out;
4141 }
4142
4143 void ip6_route_cleanup(void)
4144 {
4145         unregister_netdevice_notifier(&ip6_route_dev_notifier);
4146         unregister_pernet_subsys(&ip6_route_net_late_ops);
4147         fib6_rules_cleanup();
4148         xfrm6_fini();
4149         fib6_gc_cleanup();
4150         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4151         unregister_pernet_subsys(&ip6_route_net_ops);
4152         dst_entries_destroy(&ip6_dst_blackhole_ops);
4153         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4154 }