net: rtnetlink: plumb extended ack to doit function
[platform/kernel/linux-rpi.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
/*
 * Result of the next-hop neighbour check done by rt6_check_neigh().
 *
 * Every failure code is negative so that rt6_score_route() can hand it
 * back in place of a (non-negative) route score.
 */
enum rt6_nud_state {
        /* neighbour check passed */
        RT6_NUD_SUCCEED    =  1,
        /* find_match() scores the route 0 and requests round-robin */
        RT6_NUD_FAIL_DO_RR = -1,
        /* neighbour is in a failed state; find_match() may still probe it */
        RT6_NUD_FAIL_PROBE = -2,
        /* find_match() skips the route entirely */
        RT6_NUD_FAIL_HARD  = -3,
};
79
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101 static size_t rt6_nlmsg_size(struct rt6_info *rt);
102 static int rt6_fill_node(struct net *net,
103                          struct sk_buff *skb, struct rt6_info *rt,
104                          struct in6_addr *dst, struct in6_addr *src,
105                          int iif, int type, u32 portid, u32 seq,
106                          unsigned int flags);
107
108 #ifdef CONFIG_IPV6_ROUTE_INFO
109 static struct rt6_info *rt6_add_route_info(struct net *net,
110                                            const struct in6_addr *prefix, int prefixlen,
111                                            const struct in6_addr *gwaddr,
112                                            struct net_device *dev,
113                                            unsigned int pref);
114 static struct rt6_info *rt6_get_route_info(struct net *net,
115                                            const struct in6_addr *prefix, int prefixlen,
116                                            const struct in6_addr *gwaddr,
117                                            struct net_device *dev);
118 #endif
119
120 struct uncached_list {
121         spinlock_t              lock;
122         struct list_head        head;
123 };
124
125 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
126
127 static void rt6_uncached_list_add(struct rt6_info *rt)
128 {
129         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
130
131         rt->dst.flags |= DST_NOCACHE;
132         rt->rt6i_uncached_list = ul;
133
134         spin_lock_bh(&ul->lock);
135         list_add_tail(&rt->rt6i_uncached, &ul->head);
136         spin_unlock_bh(&ul->lock);
137 }
138
139 static void rt6_uncached_list_del(struct rt6_info *rt)
140 {
141         if (!list_empty(&rt->rt6i_uncached)) {
142                 struct uncached_list *ul = rt->rt6i_uncached_list;
143
144                 spin_lock_bh(&ul->lock);
145                 list_del(&rt->rt6i_uncached);
146                 spin_unlock_bh(&ul->lock);
147         }
148 }
149
150 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
151 {
152         struct net_device *loopback_dev = net->loopback_dev;
153         int cpu;
154
155         if (dev == loopback_dev)
156                 return;
157
158         for_each_possible_cpu(cpu) {
159                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
160                 struct rt6_info *rt;
161
162                 spin_lock_bh(&ul->lock);
163                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
164                         struct inet6_dev *rt_idev = rt->rt6i_idev;
165                         struct net_device *rt_dev = rt->dst.dev;
166
167                         if (rt_idev->dev == dev) {
168                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
169                                 in6_dev_put(rt_idev);
170                         }
171
172                         if (rt_dev == dev) {
173                                 rt->dst.dev = loopback_dev;
174                                 dev_hold(rt->dst.dev);
175                                 dev_put(rt_dev);
176                         }
177                 }
178                 spin_unlock_bh(&ul->lock);
179         }
180 }
181
182 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
183 {
184         return dst_metrics_write_ptr(rt->dst.from);
185 }
186
187 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
188 {
189         struct rt6_info *rt = (struct rt6_info *)dst;
190
191         if (rt->rt6i_flags & RTF_PCPU)
192                 return rt6_pcpu_cow_metrics(rt);
193         else if (rt->rt6i_flags & RTF_CACHE)
194                 return NULL;
195         else
196                 return dst_cow_metrics_generic(dst, old);
197 }
198
199 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
200                                              struct sk_buff *skb,
201                                              const void *daddr)
202 {
203         struct in6_addr *p = &rt->rt6i_gateway;
204
205         if (!ipv6_addr_any(p))
206                 return (const void *) p;
207         else if (skb)
208                 return &ipv6_hdr(skb)->daddr;
209         return daddr;
210 }
211
212 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
213                                           struct sk_buff *skb,
214                                           const void *daddr)
215 {
216         struct rt6_info *rt = (struct rt6_info *) dst;
217         struct neighbour *n;
218
219         daddr = choose_neigh_daddr(rt, skb, daddr);
220         n = __ipv6_neigh_lookup(dst->dev, daddr);
221         if (n)
222                 return n;
223         return neigh_create(&nd_tbl, daddr, dst->dev);
224 }
225
226 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
227 {
228         struct net_device *dev = dst->dev;
229         struct rt6_info *rt = (struct rt6_info *)dst;
230
231         daddr = choose_neigh_daddr(rt, NULL, daddr);
232         if (!daddr)
233                 return;
234         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
235                 return;
236         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
237                 return;
238         __ipv6_confirm_neigh(dev, daddr);
239 }
240
241 static struct dst_ops ip6_dst_ops_template = {
242         .family                 =       AF_INET6,
243         .gc                     =       ip6_dst_gc,
244         .gc_thresh              =       1024,
245         .check                  =       ip6_dst_check,
246         .default_advmss         =       ip6_default_advmss,
247         .mtu                    =       ip6_mtu,
248         .cow_metrics            =       ipv6_cow_metrics,
249         .destroy                =       ip6_dst_destroy,
250         .ifdown                 =       ip6_dst_ifdown,
251         .negative_advice        =       ip6_negative_advice,
252         .link_failure           =       ip6_link_failure,
253         .update_pmtu            =       ip6_rt_update_pmtu,
254         .redirect               =       rt6_do_redirect,
255         .local_out              =       __ip6_local_out,
256         .neigh_lookup           =       ip6_neigh_lookup,
257         .confirm_neigh          =       ip6_confirm_neigh,
258 };
259
260 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
261 {
262         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
263
264         return mtu ? : dst->dev->mtu;
265 }
266
267 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
268                                          struct sk_buff *skb, u32 mtu)
269 {
270 }
271
/* dst_ops->redirect hook for blackhole dsts: intentionally a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
                                      struct sock *sk,
                                      struct sk_buff *skb)
{
}
276
277 static struct dst_ops ip6_dst_blackhole_ops = {
278         .family                 =       AF_INET6,
279         .destroy                =       ip6_dst_destroy,
280         .check                  =       ip6_dst_check,
281         .mtu                    =       ip6_blackhole_mtu,
282         .default_advmss         =       ip6_default_advmss,
283         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
284         .redirect               =       ip6_rt_blackhole_redirect,
285         .cow_metrics            =       dst_cow_metrics_generic,
286         .neigh_lookup           =       ip6_neigh_lookup,
287 };
288
289 static const u32 ip6_template_metrics[RTAX_MAX] = {
290         [RTAX_HOPLIMIT - 1] = 0,
291 };
292
293 static const struct rt6_info ip6_null_entry_template = {
294         .dst = {
295                 .__refcnt       = ATOMIC_INIT(1),
296                 .__use          = 1,
297                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
298                 .error          = -ENETUNREACH,
299                 .input          = ip6_pkt_discard,
300                 .output         = ip6_pkt_discard_out,
301         },
302         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
303         .rt6i_protocol  = RTPROT_KERNEL,
304         .rt6i_metric    = ~(u32) 0,
305         .rt6i_ref       = ATOMIC_INIT(1),
306 };
307
308 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
309
310 static const struct rt6_info ip6_prohibit_entry_template = {
311         .dst = {
312                 .__refcnt       = ATOMIC_INIT(1),
313                 .__use          = 1,
314                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
315                 .error          = -EACCES,
316                 .input          = ip6_pkt_prohibit,
317                 .output         = ip6_pkt_prohibit_out,
318         },
319         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
320         .rt6i_protocol  = RTPROT_KERNEL,
321         .rt6i_metric    = ~(u32) 0,
322         .rt6i_ref       = ATOMIC_INIT(1),
323 };
324
325 static const struct rt6_info ip6_blk_hole_entry_template = {
326         .dst = {
327                 .__refcnt       = ATOMIC_INIT(1),
328                 .__use          = 1,
329                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
330                 .error          = -EINVAL,
331                 .input          = dst_discard,
332                 .output         = dst_discard_out,
333         },
334         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
335         .rt6i_protocol  = RTPROT_KERNEL,
336         .rt6i_metric    = ~(u32) 0,
337         .rt6i_ref       = ATOMIC_INIT(1),
338 };
339
340 #endif
341
342 static void rt6_info_init(struct rt6_info *rt)
343 {
344         struct dst_entry *dst = &rt->dst;
345
346         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
347         INIT_LIST_HEAD(&rt->rt6i_siblings);
348         INIT_LIST_HEAD(&rt->rt6i_uncached);
349 }
350
351 /* allocate dst with ip6_dst_ops */
352 static struct rt6_info *__ip6_dst_alloc(struct net *net,
353                                         struct net_device *dev,
354                                         int flags)
355 {
356         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357                                         0, DST_OBSOLETE_FORCE_CHK, flags);
358
359         if (rt)
360                 rt6_info_init(rt);
361
362         return rt;
363 }
364
365 struct rt6_info *ip6_dst_alloc(struct net *net,
366                                struct net_device *dev,
367                                int flags)
368 {
369         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
370
371         if (rt) {
372                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
373                 if (rt->rt6i_pcpu) {
374                         int cpu;
375
376                         for_each_possible_cpu(cpu) {
377                                 struct rt6_info **p;
378
379                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
380                                 /* no one shares rt */
381                                 *p =  NULL;
382                         }
383                 } else {
384                         dst_destroy((struct dst_entry *)rt);
385                         return NULL;
386                 }
387         }
388
389         return rt;
390 }
391 EXPORT_SYMBOL(ip6_dst_alloc);
392
393 static void ip6_dst_destroy(struct dst_entry *dst)
394 {
395         struct rt6_info *rt = (struct rt6_info *)dst;
396         struct dst_entry *from = dst->from;
397         struct inet6_dev *idev;
398
399         dst_destroy_metrics_generic(dst);
400         free_percpu(rt->rt6i_pcpu);
401         rt6_uncached_list_del(rt);
402
403         idev = rt->rt6i_idev;
404         if (idev) {
405                 rt->rt6i_idev = NULL;
406                 in6_dev_put(idev);
407         }
408
409         dst->from = NULL;
410         dst_release(from);
411 }
412
413 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
414                            int how)
415 {
416         struct rt6_info *rt = (struct rt6_info *)dst;
417         struct inet6_dev *idev = rt->rt6i_idev;
418         struct net_device *loopback_dev =
419                 dev_net(dev)->loopback_dev;
420
421         if (dev != loopback_dev) {
422                 if (idev && idev->dev == dev) {
423                         struct inet6_dev *loopback_idev =
424                                 in6_dev_get(loopback_dev);
425                         if (loopback_idev) {
426                                 rt->rt6i_idev = loopback_idev;
427                                 in6_dev_put(idev);
428                         }
429                 }
430         }
431 }
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443         if (rt->rt6i_flags & RTF_EXPIRES) {
444                 if (time_after(jiffies, rt->dst.expires))
445                         return true;
446         } else if (rt->dst.from) {
447                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
448         }
449         return false;
450 }
451
/* Multipath route selection:
 *   Map the flow hash of @fl6 onto an index in [0, candidate_count).
 * Adapted from fib_info_hashfn()
 */
static int rt6_info_hash_nhsfn(unsigned int candidate_count,
                               const struct flowi6 *fl6)
{
        unsigned int flow_hash = get_hash_from_flowi6(fl6);

        return flow_hash % candidate_count;
}
461
462 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
463                                              struct flowi6 *fl6, int oif,
464                                              int strict)
465 {
466         struct rt6_info *sibling, *next_sibling;
467         int route_choosen;
468
469         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
470         /* Don't change the route, if route_choosen == 0
471          * (siblings does not include ourself)
472          */
473         if (route_choosen)
474                 list_for_each_entry_safe(sibling, next_sibling,
475                                 &match->rt6i_siblings, rt6i_siblings) {
476                         route_choosen--;
477                         if (route_choosen == 0) {
478                                 if (rt6_score_route(sibling, oif, strict) < 0)
479                                         break;
480                                 match = sibling;
481                                 break;
482                         }
483                 }
484         return match;
485 }
486
487 /*
488  *      Route lookup. Any table->tb6_lock is implied.
489  */
490
491 static inline struct rt6_info *rt6_device_match(struct net *net,
492                                                     struct rt6_info *rt,
493                                                     const struct in6_addr *saddr,
494                                                     int oif,
495                                                     int flags)
496 {
497         struct rt6_info *local = NULL;
498         struct rt6_info *sprt;
499
500         if (!oif && ipv6_addr_any(saddr))
501                 goto out;
502
503         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
504                 struct net_device *dev = sprt->dst.dev;
505
506                 if (oif) {
507                         if (dev->ifindex == oif)
508                                 return sprt;
509                         if (dev->flags & IFF_LOOPBACK) {
510                                 if (!sprt->rt6i_idev ||
511                                     sprt->rt6i_idev->dev->ifindex != oif) {
512                                         if (flags & RT6_LOOKUP_F_IFACE)
513                                                 continue;
514                                         if (local &&
515                                             local->rt6i_idev->dev->ifindex == oif)
516                                                 continue;
517                                 }
518                                 local = sprt;
519                         }
520                 } else {
521                         if (ipv6_chk_addr(net, saddr, dev,
522                                           flags & RT6_LOOKUP_F_IFACE))
523                                 return sprt;
524                 }
525         }
526
527         if (oif) {
528                 if (local)
529                         return local;
530
531                 if (flags & RT6_LOOKUP_F_IFACE)
532                         return net->ipv6.ip6_null_entry;
533         }
534 out:
535         return rt;
536 }
537
538 #ifdef CONFIG_IPV6_ROUTER_PREF
539 struct __rt6_probe_work {
540         struct work_struct work;
541         struct in6_addr target;
542         struct net_device *dev;
543 };
544
545 static void rt6_probe_deferred(struct work_struct *w)
546 {
547         struct in6_addr mcaddr;
548         struct __rt6_probe_work *work =
549                 container_of(w, struct __rt6_probe_work, work);
550
551         addrconf_addr_solict_mult(&work->target, &mcaddr);
552         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
553         dev_put(work->dev);
554         kfree(work);
555 }
556
557 static void rt6_probe(struct rt6_info *rt)
558 {
559         struct __rt6_probe_work *work;
560         struct neighbour *neigh;
561         /*
562          * Okay, this does not seem to be appropriate
563          * for now, however, we need to check if it
564          * is really so; aka Router Reachability Probing.
565          *
566          * Router Reachability Probe MUST be rate-limited
567          * to no more than one per minute.
568          */
569         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
570                 return;
571         rcu_read_lock_bh();
572         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
573         if (neigh) {
574                 if (neigh->nud_state & NUD_VALID)
575                         goto out;
576
577                 work = NULL;
578                 write_lock(&neigh->lock);
579                 if (!(neigh->nud_state & NUD_VALID) &&
580                     time_after(jiffies,
581                                neigh->updated +
582                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
583                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
584                         if (work)
585                                 __neigh_set_probe_once(neigh);
586                 }
587                 write_unlock(&neigh->lock);
588         } else {
589                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
590         }
591
592         if (work) {
593                 INIT_WORK(&work->work, rt6_probe_deferred);
594                 work->target = rt->rt6i_gateway;
595                 dev_hold(rt->dst.dev);
596                 work->dev = rt->dst.dev;
597                 schedule_work(&work->work);
598         }
599
600 out:
601         rcu_read_unlock_bh();
602 }
603 #else
/* Without CONFIG_IPV6_ROUTER_PREF router probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
607 #endif
608
609 /*
610  * Default Router Selection (RFC 2461 6.3.6)
611  */
612 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
613 {
614         struct net_device *dev = rt->dst.dev;
615         if (!oif || dev->ifindex == oif)
616                 return 2;
617         if ((dev->flags & IFF_LOOPBACK) &&
618             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
619                 return 1;
620         return 0;
621 }
622
623 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
624 {
625         struct neighbour *neigh;
626         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
627
628         if (rt->rt6i_flags & RTF_NONEXTHOP ||
629             !(rt->rt6i_flags & RTF_GATEWAY))
630                 return RT6_NUD_SUCCEED;
631
632         rcu_read_lock_bh();
633         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
634         if (neigh) {
635                 read_lock(&neigh->lock);
636                 if (neigh->nud_state & NUD_VALID)
637                         ret = RT6_NUD_SUCCEED;
638 #ifdef CONFIG_IPV6_ROUTER_PREF
639                 else if (!(neigh->nud_state & NUD_FAILED))
640                         ret = RT6_NUD_SUCCEED;
641                 else
642                         ret = RT6_NUD_FAIL_PROBE;
643 #endif
644                 read_unlock(&neigh->lock);
645         } else {
646                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
647                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
648         }
649         rcu_read_unlock_bh();
650
651         return ret;
652 }
653
654 static int rt6_score_route(struct rt6_info *rt, int oif,
655                            int strict)
656 {
657         int m;
658
659         m = rt6_check_dev(rt, oif);
660         if (!m && (strict & RT6_LOOKUP_F_IFACE))
661                 return RT6_NUD_FAIL_HARD;
662 #ifdef CONFIG_IPV6_ROUTER_PREF
663         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
664 #endif
665         if (strict & RT6_LOOKUP_F_REACHABLE) {
666                 int n = rt6_check_neigh(rt);
667                 if (n < 0)
668                         return n;
669         }
670         return m;
671 }
672
673 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
674                                    int *mpri, struct rt6_info *match,
675                                    bool *do_rr)
676 {
677         int m;
678         bool match_do_rr = false;
679         struct inet6_dev *idev = rt->rt6i_idev;
680         struct net_device *dev = rt->dst.dev;
681
682         if (dev && !netif_carrier_ok(dev) &&
683             idev->cnf.ignore_routes_with_linkdown &&
684             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
685                 goto out;
686
687         if (rt6_check_expired(rt))
688                 goto out;
689
690         m = rt6_score_route(rt, oif, strict);
691         if (m == RT6_NUD_FAIL_DO_RR) {
692                 match_do_rr = true;
693                 m = 0; /* lowest valid score */
694         } else if (m == RT6_NUD_FAIL_HARD) {
695                 goto out;
696         }
697
698         if (strict & RT6_LOOKUP_F_REACHABLE)
699                 rt6_probe(rt);
700
701         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
702         if (m > *mpri) {
703                 *do_rr = match_do_rr;
704                 *mpri = m;
705                 match = rt;
706         }
707 out:
708         return match;
709 }
710
711 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
712                                      struct rt6_info *rr_head,
713                                      u32 metric, int oif, int strict,
714                                      bool *do_rr)
715 {
716         struct rt6_info *rt, *match, *cont;
717         int mpri = -1;
718
719         match = NULL;
720         cont = NULL;
721         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
722                 if (rt->rt6i_metric != metric) {
723                         cont = rt;
724                         break;
725                 }
726
727                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
728         }
729
730         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
731                 if (rt->rt6i_metric != metric) {
732                         cont = rt;
733                         break;
734                 }
735
736                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
737         }
738
739         if (match || !cont)
740                 return match;
741
742         for (rt = cont; rt; rt = rt->dst.rt6_next)
743                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
744
745         return match;
746 }
747
748 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
749 {
750         struct rt6_info *match, *rt0;
751         struct net *net;
752         bool do_rr = false;
753
754         rt0 = fn->rr_ptr;
755         if (!rt0)
756                 fn->rr_ptr = rt0 = fn->leaf;
757
758         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
759                              &do_rr);
760
761         if (do_rr) {
762                 struct rt6_info *next = rt0->dst.rt6_next;
763
764                 /* no entries matched; do round-robin */
765                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
766                         next = fn->leaf;
767
768                 if (next != rt0)
769                         fn->rr_ptr = next;
770         }
771
772         net = dev_net(rt0->dst.dev);
773         return match ? match : net->ipv6.ip6_null_entry;
774 }
775
776 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
777 {
778         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
779 }
780
781 #ifdef CONFIG_IPV6_ROUTE_INFO
782 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
783                   const struct in6_addr *gwaddr)
784 {
785         struct net *net = dev_net(dev);
786         struct route_info *rinfo = (struct route_info *) opt;
787         struct in6_addr prefix_buf, *prefix;
788         unsigned int pref;
789         unsigned long lifetime;
790         struct rt6_info *rt;
791
792         if (len < sizeof(struct route_info)) {
793                 return -EINVAL;
794         }
795
796         /* Sanity check for prefix_len and length */
797         if (rinfo->length > 3) {
798                 return -EINVAL;
799         } else if (rinfo->prefix_len > 128) {
800                 return -EINVAL;
801         } else if (rinfo->prefix_len > 64) {
802                 if (rinfo->length < 2) {
803                         return -EINVAL;
804                 }
805         } else if (rinfo->prefix_len > 0) {
806                 if (rinfo->length < 1) {
807                         return -EINVAL;
808                 }
809         }
810
811         pref = rinfo->route_pref;
812         if (pref == ICMPV6_ROUTER_PREF_INVALID)
813                 return -EINVAL;
814
815         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
816
817         if (rinfo->length == 3)
818                 prefix = (struct in6_addr *)rinfo->prefix;
819         else {
820                 /* this function is safe */
821                 ipv6_addr_prefix(&prefix_buf,
822                                  (struct in6_addr *)rinfo->prefix,
823                                  rinfo->prefix_len);
824                 prefix = &prefix_buf;
825         }
826
827         if (rinfo->prefix_len == 0)
828                 rt = rt6_get_dflt_router(gwaddr, dev);
829         else
830                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
831                                         gwaddr, dev);
832
833         if (rt && !lifetime) {
834                 ip6_del_rt(rt);
835                 rt = NULL;
836         }
837
838         if (!rt && lifetime)
839                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
840                                         dev, pref);
841         else if (rt)
842                 rt->rt6i_flags = RTF_ROUTEINFO |
843                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
844
845         if (rt) {
846                 if (!addrconf_finite_timeout(lifetime))
847                         rt6_clean_expires(rt);
848                 else
849                         rt6_set_expires(rt, jiffies + HZ * lifetime);
850
851                 ip6_rt_put(rt);
852         }
853         return 0;
854 }
855 #endif
856
857 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
858                                         struct in6_addr *saddr)
859 {
860         struct fib6_node *pn;
861         while (1) {
862                 if (fn->fn_flags & RTN_TL_ROOT)
863                         return NULL;
864                 pn = fn->parent;
865                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
866                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
867                 else
868                         fn = pn;
869                 if (fn->fn_flags & RTN_RTINFO)
870                         return fn;
871         }
872 }
873
874 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
875                                              struct fib6_table *table,
876                                              struct flowi6 *fl6, int flags)
877 {
878         struct fib6_node *fn;
879         struct rt6_info *rt;
880
881         read_lock_bh(&table->tb6_lock);
882         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
883 restart:
884         rt = fn->leaf;
885         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
886         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
887                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
888         if (rt == net->ipv6.ip6_null_entry) {
889                 fn = fib6_backtrack(fn, &fl6->saddr);
890                 if (fn)
891                         goto restart;
892         }
893         dst_use(&rt->dst, jiffies);
894         read_unlock_bh(&table->tb6_lock);
895
896         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
897
898         return rt;
899
900 }
901
902 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
903                                     int flags)
904 {
905         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
906 }
907 EXPORT_SYMBOL_GPL(ip6_route_lookup);
908
909 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
910                             const struct in6_addr *saddr, int oif, int strict)
911 {
912         struct flowi6 fl6 = {
913                 .flowi6_oif = oif,
914                 .daddr = *daddr,
915         };
916         struct dst_entry *dst;
917         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
918
919         if (saddr) {
920                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
921                 flags |= RT6_LOOKUP_F_HAS_SADDR;
922         }
923
924         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
925         if (dst->error == 0)
926                 return (struct rt6_info *) dst;
927
928         dst_release(dst);
929
930         return NULL;
931 }
932 EXPORT_SYMBOL(rt6_lookup);
933
934 /* ip6_ins_rt is called with FREE table->tb6_lock.
935    It takes new route entry, the addition fails by any reason the
936    route is freed. In any case, if caller does not hold it, it may
937    be destroyed.
938  */
939
940 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
941                         struct mx6_config *mxc)
942 {
943         int err;
944         struct fib6_table *table;
945
946         table = rt->rt6i_table;
947         write_lock_bh(&table->tb6_lock);
948         err = fib6_add(&table->tb6_root, rt, info, mxc);
949         write_unlock_bh(&table->tb6_lock);
950
951         return err;
952 }
953
954 int ip6_ins_rt(struct rt6_info *rt)
955 {
956         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
957         struct mx6_config mxc = { .mx = NULL, };
958
959         return __ip6_ins_rt(rt, &info, &mxc);
960 }
961
962 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
963                                            const struct in6_addr *daddr,
964                                            const struct in6_addr *saddr)
965 {
966         struct rt6_info *rt;
967
968         /*
969          *      Clone the route.
970          */
971
972         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
973                 ort = (struct rt6_info *)ort->dst.from;
974
975         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
976
977         if (!rt)
978                 return NULL;
979
980         ip6_rt_copy_init(rt, ort);
981         rt->rt6i_flags |= RTF_CACHE;
982         rt->rt6i_metric = 0;
983         rt->dst.flags |= DST_HOST;
984         rt->rt6i_dst.addr = *daddr;
985         rt->rt6i_dst.plen = 128;
986
987         if (!rt6_is_gw_or_nonexthop(ort)) {
988                 if (ort->rt6i_dst.plen != 128 &&
989                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
990                         rt->rt6i_flags |= RTF_ANYCAST;
991 #ifdef CONFIG_IPV6_SUBTREES
992                 if (rt->rt6i_src.plen && saddr) {
993                         rt->rt6i_src.addr = *saddr;
994                         rt->rt6i_src.plen = 128;
995                 }
996 #endif
997         }
998
999         return rt;
1000 }
1001
1002 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1003 {
1004         struct rt6_info *pcpu_rt;
1005
1006         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
1007                                   rt->dst.dev, rt->dst.flags);
1008
1009         if (!pcpu_rt)
1010                 return NULL;
1011         ip6_rt_copy_init(pcpu_rt, rt);
1012         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1013         pcpu_rt->rt6i_flags |= RTF_PCPU;
1014         return pcpu_rt;
1015 }
1016
1017 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1018 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1019 {
1020         struct rt6_info *pcpu_rt, **p;
1021
1022         p = this_cpu_ptr(rt->rt6i_pcpu);
1023         pcpu_rt = *p;
1024
1025         if (pcpu_rt) {
1026                 dst_hold(&pcpu_rt->dst);
1027                 rt6_dst_from_metrics_check(pcpu_rt);
1028         }
1029         return pcpu_rt;
1030 }
1031
1032 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1033 {
1034         struct fib6_table *table = rt->rt6i_table;
1035         struct rt6_info *pcpu_rt, *prev, **p;
1036
1037         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1038         if (!pcpu_rt) {
1039                 struct net *net = dev_net(rt->dst.dev);
1040
1041                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1042                 return net->ipv6.ip6_null_entry;
1043         }
1044
1045         read_lock_bh(&table->tb6_lock);
1046         if (rt->rt6i_pcpu) {
1047                 p = this_cpu_ptr(rt->rt6i_pcpu);
1048                 prev = cmpxchg(p, NULL, pcpu_rt);
1049                 if (prev) {
1050                         /* If someone did it before us, return prev instead */
1051                         dst_destroy(&pcpu_rt->dst);
1052                         pcpu_rt = prev;
1053                 }
1054         } else {
1055                 /* rt has been removed from the fib6 tree
1056                  * before we have a chance to acquire the read_lock.
1057                  * In this case, don't brother to create a pcpu rt
1058                  * since rt is going away anyway.  The next
1059                  * dst_check() will trigger a re-lookup.
1060                  */
1061                 dst_destroy(&pcpu_rt->dst);
1062                 pcpu_rt = rt;
1063         }
1064         dst_hold(&pcpu_rt->dst);
1065         rt6_dst_from_metrics_check(pcpu_rt);
1066         read_unlock_bh(&table->tb6_lock);
1067         return pcpu_rt;
1068 }
1069
1070 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1071                                int oif, struct flowi6 *fl6, int flags)
1072 {
1073         struct fib6_node *fn, *saved_fn;
1074         struct rt6_info *rt;
1075         int strict = 0;
1076
1077         strict |= flags & RT6_LOOKUP_F_IFACE;
1078         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1079         if (net->ipv6.devconf_all->forwarding == 0)
1080                 strict |= RT6_LOOKUP_F_REACHABLE;
1081
1082         read_lock_bh(&table->tb6_lock);
1083
1084         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1085         saved_fn = fn;
1086
1087         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1088                 oif = 0;
1089
1090 redo_rt6_select:
1091         rt = rt6_select(fn, oif, strict);
1092         if (rt->rt6i_nsiblings)
1093                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1094         if (rt == net->ipv6.ip6_null_entry) {
1095                 fn = fib6_backtrack(fn, &fl6->saddr);
1096                 if (fn)
1097                         goto redo_rt6_select;
1098                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1099                         /* also consider unreachable route */
1100                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1101                         fn = saved_fn;
1102                         goto redo_rt6_select;
1103                 }
1104         }
1105
1106
1107         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1108                 dst_use(&rt->dst, jiffies);
1109                 read_unlock_bh(&table->tb6_lock);
1110
1111                 rt6_dst_from_metrics_check(rt);
1112
1113                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1114                 return rt;
1115         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1116                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1117                 /* Create a RTF_CACHE clone which will not be
1118                  * owned by the fib6 tree.  It is for the special case where
1119                  * the daddr in the skb during the neighbor look-up is different
1120                  * from the fl6->daddr used to look-up route here.
1121                  */
1122
1123                 struct rt6_info *uncached_rt;
1124
1125                 dst_use(&rt->dst, jiffies);
1126                 read_unlock_bh(&table->tb6_lock);
1127
1128                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1129                 dst_release(&rt->dst);
1130
1131                 if (uncached_rt)
1132                         rt6_uncached_list_add(uncached_rt);
1133                 else
1134                         uncached_rt = net->ipv6.ip6_null_entry;
1135
1136                 dst_hold(&uncached_rt->dst);
1137
1138                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1139                 return uncached_rt;
1140
1141         } else {
1142                 /* Get a percpu copy */
1143
1144                 struct rt6_info *pcpu_rt;
1145
1146                 rt->dst.lastuse = jiffies;
1147                 rt->dst.__use++;
1148                 pcpu_rt = rt6_get_pcpu_route(rt);
1149
1150                 if (pcpu_rt) {
1151                         read_unlock_bh(&table->tb6_lock);
1152                 } else {
1153                         /* We have to do the read_unlock first
1154                          * because rt6_make_pcpu_route() may trigger
1155                          * ip6_dst_gc() which will take the write_lock.
1156                          */
1157                         dst_hold(&rt->dst);
1158                         read_unlock_bh(&table->tb6_lock);
1159                         pcpu_rt = rt6_make_pcpu_route(rt);
1160                         dst_release(&rt->dst);
1161                 }
1162
1163                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1164                 return pcpu_rt;
1165
1166         }
1167 }
1168 EXPORT_SYMBOL_GPL(ip6_pol_route);
1169
1170 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1171                                             struct flowi6 *fl6, int flags)
1172 {
1173         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1174 }
1175
1176 struct dst_entry *ip6_route_input_lookup(struct net *net,
1177                                          struct net_device *dev,
1178                                          struct flowi6 *fl6, int flags)
1179 {
1180         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1181                 flags |= RT6_LOOKUP_F_IFACE;
1182
1183         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1184 }
1185 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1186
1187 void ip6_route_input(struct sk_buff *skb)
1188 {
1189         const struct ipv6hdr *iph = ipv6_hdr(skb);
1190         struct net *net = dev_net(skb->dev);
1191         int flags = RT6_LOOKUP_F_HAS_SADDR;
1192         struct ip_tunnel_info *tun_info;
1193         struct flowi6 fl6 = {
1194                 .flowi6_iif = skb->dev->ifindex,
1195                 .daddr = iph->daddr,
1196                 .saddr = iph->saddr,
1197                 .flowlabel = ip6_flowinfo(iph),
1198                 .flowi6_mark = skb->mark,
1199                 .flowi6_proto = iph->nexthdr,
1200         };
1201
1202         tun_info = skb_tunnel_info(skb);
1203         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1204                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1205         skb_dst_drop(skb);
1206         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1207 }
1208
1209 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1210                                              struct flowi6 *fl6, int flags)
1211 {
1212         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1213 }
1214
1215 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1216                                          struct flowi6 *fl6, int flags)
1217 {
1218         bool any_src;
1219
1220         if (rt6_need_strict(&fl6->daddr)) {
1221                 struct dst_entry *dst;
1222
1223                 dst = l3mdev_link_scope_lookup(net, fl6);
1224                 if (dst)
1225                         return dst;
1226         }
1227
1228         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1229
1230         any_src = ipv6_addr_any(&fl6->saddr);
1231         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1232             (fl6->flowi6_oif && any_src))
1233                 flags |= RT6_LOOKUP_F_IFACE;
1234
1235         if (!any_src)
1236                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1237         else if (sk)
1238                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1239
1240         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1241 }
1242 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1243
1244 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1245 {
1246         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1247         struct dst_entry *new = NULL;
1248
1249         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1250         if (rt) {
1251                 rt6_info_init(rt);
1252
1253                 new = &rt->dst;
1254                 new->__use = 1;
1255                 new->input = dst_discard;
1256                 new->output = dst_discard_out;
1257
1258                 dst_copy_metrics(new, &ort->dst);
1259                 rt->rt6i_idev = ort->rt6i_idev;
1260                 if (rt->rt6i_idev)
1261                         in6_dev_hold(rt->rt6i_idev);
1262
1263                 rt->rt6i_gateway = ort->rt6i_gateway;
1264                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1265                 rt->rt6i_metric = 0;
1266
1267                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1268 #ifdef CONFIG_IPV6_SUBTREES
1269                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1270 #endif
1271
1272                 dst_free(new);
1273         }
1274
1275         dst_release(dst_orig);
1276         return new ? new : ERR_PTR(-ENOMEM);
1277 }
1278
1279 /*
1280  *      Destination cache support functions
1281  */
1282
1283 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1284 {
1285         if (rt->dst.from &&
1286             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1287                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1288 }
1289
1290 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1291 {
1292         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1293                 return NULL;
1294
1295         if (rt6_check_expired(rt))
1296                 return NULL;
1297
1298         return &rt->dst;
1299 }
1300
1301 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1302 {
1303         if (!__rt6_check_expired(rt) &&
1304             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1305             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1306                 return &rt->dst;
1307         else
1308                 return NULL;
1309 }
1310
1311 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1312 {
1313         struct rt6_info *rt;
1314
1315         rt = (struct rt6_info *) dst;
1316
1317         /* All IPV6 dsts are created with ->obsolete set to the value
1318          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1319          * into this function always.
1320          */
1321
1322         rt6_dst_from_metrics_check(rt);
1323
1324         if (rt->rt6i_flags & RTF_PCPU ||
1325             (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1326                 return rt6_dst_from_check(rt, cookie);
1327         else
1328                 return rt6_check(rt, cookie);
1329 }
1330
1331 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1332 {
1333         struct rt6_info *rt = (struct rt6_info *) dst;
1334
1335         if (rt) {
1336                 if (rt->rt6i_flags & RTF_CACHE) {
1337                         if (rt6_check_expired(rt)) {
1338                                 ip6_del_rt(rt);
1339                                 dst = NULL;
1340                         }
1341                 } else {
1342                         dst_release(dst);
1343                         dst = NULL;
1344                 }
1345         }
1346         return dst;
1347 }
1348
1349 static void ip6_link_failure(struct sk_buff *skb)
1350 {
1351         struct rt6_info *rt;
1352
1353         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1354
1355         rt = (struct rt6_info *) skb_dst(skb);
1356         if (rt) {
1357                 if (rt->rt6i_flags & RTF_CACHE) {
1358                         dst_hold(&rt->dst);
1359                         ip6_del_rt(rt);
1360                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1361                         rt->rt6i_node->fn_sernum = -1;
1362                 }
1363         }
1364 }
1365
1366 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1367 {
1368         struct net *net = dev_net(rt->dst.dev);
1369
1370         rt->rt6i_flags |= RTF_MODIFIED;
1371         rt->rt6i_pmtu = mtu;
1372         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1373 }
1374
1375 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1376 {
1377         return !(rt->rt6i_flags & RTF_CACHE) &&
1378                 (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
1379 }
1380
1381 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1382                                  const struct ipv6hdr *iph, u32 mtu)
1383 {
1384         const struct in6_addr *daddr, *saddr;
1385         struct rt6_info *rt6 = (struct rt6_info *)dst;
1386
1387         if (rt6->rt6i_flags & RTF_LOCAL)
1388                 return;
1389
1390         if (dst_metric_locked(dst, RTAX_MTU))
1391                 return;
1392
1393         if (iph) {
1394                 daddr = &iph->daddr;
1395                 saddr = &iph->saddr;
1396         } else if (sk) {
1397                 daddr = &sk->sk_v6_daddr;
1398                 saddr = &inet6_sk(sk)->saddr;
1399         } else {
1400                 daddr = NULL;
1401                 saddr = NULL;
1402         }
1403         dst_confirm_neigh(dst, daddr);
1404         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1405         if (mtu >= dst_mtu(dst))
1406                 return;
1407
1408         if (!rt6_cache_allowed_for_pmtu(rt6)) {
1409                 rt6_do_update_pmtu(rt6, mtu);
1410         } else if (daddr) {
1411                 struct rt6_info *nrt6;
1412
1413                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1414                 if (nrt6) {
1415                         rt6_do_update_pmtu(nrt6, mtu);
1416
1417                         /* ip6_ins_rt(nrt6) will bump the
1418                          * rt6->rt6i_node->fn_sernum
1419                          * which will fail the next rt6_check() and
1420                          * invalidate the sk->sk_dst_cache.
1421                          */
1422                         ip6_ins_rt(nrt6);
1423                 }
1424         }
1425 }
1426
1427 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1428                                struct sk_buff *skb, u32 mtu)
1429 {
1430         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1431 }
1432
1433 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1434                      int oif, u32 mark, kuid_t uid)
1435 {
1436         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1437         struct dst_entry *dst;
1438         struct flowi6 fl6;
1439
1440         memset(&fl6, 0, sizeof(fl6));
1441         fl6.flowi6_oif = oif;
1442         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1443         fl6.daddr = iph->daddr;
1444         fl6.saddr = iph->saddr;
1445         fl6.flowlabel = ip6_flowinfo(iph);
1446         fl6.flowi6_uid = uid;
1447
1448         dst = ip6_route_output(net, NULL, &fl6);
1449         if (!dst->error)
1450                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1451         dst_release(dst);
1452 }
1453 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1454
1455 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1456 {
1457         struct dst_entry *dst;
1458
1459         ip6_update_pmtu(skb, sock_net(sk), mtu,
1460                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
1461
1462         dst = __sk_dst_get(sk);
1463         if (!dst || !dst->obsolete ||
1464             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1465                 return;
1466
1467         bh_lock_sock(sk);
1468         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1469                 ip6_datagram_dst_update(sk, false);
1470         bh_unlock_sock(sk);
1471 }
1472 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1473
1474 /* Handle redirects */
1475 struct ip6rd_flowi {
1476         struct flowi6 fl6;
1477         struct in6_addr gateway;
1478 };
1479
1480 static struct rt6_info *__ip6_route_redirect(struct net *net,
1481                                              struct fib6_table *table,
1482                                              struct flowi6 *fl6,
1483                                              int flags)
1484 {
1485         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1486         struct rt6_info *rt;
1487         struct fib6_node *fn;
1488
1489         /* Get the "current" route for this destination and
1490          * check if the redirect has come from appropriate router.
1491          *
1492          * RFC 4861 specifies that redirects should only be
1493          * accepted if they come from the nexthop to the target.
1494          * Due to the way the routes are chosen, this notion
1495          * is a bit fuzzy and one might need to check all possible
1496          * routes.
1497          */
1498
1499         read_lock_bh(&table->tb6_lock);
1500         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1501 restart:
1502         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1503                 if (rt6_check_expired(rt))
1504                         continue;
1505                 if (rt->dst.error)
1506                         break;
1507                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1508                         continue;
1509                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1510                         continue;
1511                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1512                         continue;
1513                 break;
1514         }
1515
1516         if (!rt)
1517                 rt = net->ipv6.ip6_null_entry;
1518         else if (rt->dst.error) {
1519                 rt = net->ipv6.ip6_null_entry;
1520                 goto out;
1521         }
1522
1523         if (rt == net->ipv6.ip6_null_entry) {
1524                 fn = fib6_backtrack(fn, &fl6->saddr);
1525                 if (fn)
1526                         goto restart;
1527         }
1528
1529 out:
1530         dst_hold(&rt->dst);
1531
1532         read_unlock_bh(&table->tb6_lock);
1533
1534         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1535         return rt;
1536 };
1537
1538 static struct dst_entry *ip6_route_redirect(struct net *net,
1539                                         const struct flowi6 *fl6,
1540                                         const struct in6_addr *gateway)
1541 {
1542         int flags = RT6_LOOKUP_F_HAS_SADDR;
1543         struct ip6rd_flowi rdfl;
1544
1545         rdfl.fl6 = *fl6;
1546         rdfl.gateway = *gateway;
1547
1548         return fib6_rule_lookup(net, &rdfl.fl6,
1549                                 flags, __ip6_route_redirect);
1550 }
1551
1552 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
1553                   kuid_t uid)
1554 {
1555         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1556         struct dst_entry *dst;
1557         struct flowi6 fl6;
1558
1559         memset(&fl6, 0, sizeof(fl6));
1560         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1561         fl6.flowi6_oif = oif;
1562         fl6.flowi6_mark = mark;
1563         fl6.daddr = iph->daddr;
1564         fl6.saddr = iph->saddr;
1565         fl6.flowlabel = ip6_flowinfo(iph);
1566         fl6.flowi6_uid = uid;
1567
1568         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1569         rt6_do_redirect(dst, NULL, skb);
1570         dst_release(dst);
1571 }
1572 EXPORT_SYMBOL_GPL(ip6_redirect);
1573
1574 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1575                             u32 mark)
1576 {
1577         const struct ipv6hdr *iph = ipv6_hdr(skb);
1578         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1579         struct dst_entry *dst;
1580         struct flowi6 fl6;
1581
1582         memset(&fl6, 0, sizeof(fl6));
1583         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1584         fl6.flowi6_oif = oif;
1585         fl6.flowi6_mark = mark;
1586         fl6.daddr = msg->dest;
1587         fl6.saddr = iph->daddr;
1588         fl6.flowi6_uid = sock_net_uid(net, NULL);
1589
1590         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1591         rt6_do_redirect(dst, NULL, skb);
1592         dst_release(dst);
1593 }
1594
1595 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1596 {
1597         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
1598                      sk->sk_uid);
1599 }
1600 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1601
1602 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1603 {
1604         struct net_device *dev = dst->dev;
1605         unsigned int mtu = dst_mtu(dst);
1606         struct net *net = dev_net(dev);
1607
1608         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1609
1610         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1611                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1612
1613         /*
1614          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1615          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1616          * IPV6_MAXPLEN is also valid and means: "any MSS,
1617          * rely only on pmtu discovery"
1618          */
1619         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1620                 mtu = IPV6_MAXPLEN;
1621         return mtu;
1622 }
1623
1624 static unsigned int ip6_mtu(const struct dst_entry *dst)
1625 {
1626         const struct rt6_info *rt = (const struct rt6_info *)dst;
1627         unsigned int mtu = rt->rt6i_pmtu;
1628         struct inet6_dev *idev;
1629
1630         if (mtu)
1631                 goto out;
1632
1633         mtu = dst_metric_raw(dst, RTAX_MTU);
1634         if (mtu)
1635                 goto out;
1636
1637         mtu = IPV6_MIN_MTU;
1638
1639         rcu_read_lock();
1640         idev = __in6_dev_get(dst->dev);
1641         if (idev)
1642                 mtu = idev->cnf.mtu6;
1643         rcu_read_unlock();
1644
1645 out:
1646         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1647
1648         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1649 }
1650
/* Taken (bh-safe) around every walk or update of icmp6_dst_gc_list. */
static DEFINE_SPINLOCK(icmp6_dst_lock);
/* Head of the list of dsts from icmp6_dst_alloc(), chained by ->next. */
static struct dst_entry *icmp6_dst_gc_list;
1653
1654 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1655                                   struct flowi6 *fl6)
1656 {
1657         struct dst_entry *dst;
1658         struct rt6_info *rt;
1659         struct inet6_dev *idev = in6_dev_get(dev);
1660         struct net *net = dev_net(dev);
1661
1662         if (unlikely(!idev))
1663                 return ERR_PTR(-ENODEV);
1664
1665         rt = ip6_dst_alloc(net, dev, 0);
1666         if (unlikely(!rt)) {
1667                 in6_dev_put(idev);
1668                 dst = ERR_PTR(-ENOMEM);
1669                 goto out;
1670         }
1671
1672         rt->dst.flags |= DST_HOST;
1673         rt->dst.output  = ip6_output;
1674         atomic_set(&rt->dst.__refcnt, 1);
1675         rt->rt6i_gateway  = fl6->daddr;
1676         rt->rt6i_dst.addr = fl6->daddr;
1677         rt->rt6i_dst.plen = 128;
1678         rt->rt6i_idev     = idev;
1679         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1680
1681         spin_lock_bh(&icmp6_dst_lock);
1682         rt->dst.next = icmp6_dst_gc_list;
1683         icmp6_dst_gc_list = &rt->dst;
1684         spin_unlock_bh(&icmp6_dst_lock);
1685
1686         fib6_force_start_gc(net);
1687
1688         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1689
1690 out:
1691         return dst;
1692 }
1693
1694 int icmp6_dst_gc(void)
1695 {
1696         struct dst_entry *dst, **pprev;
1697         int more = 0;
1698
1699         spin_lock_bh(&icmp6_dst_lock);
1700         pprev = &icmp6_dst_gc_list;
1701
1702         while ((dst = *pprev) != NULL) {
1703                 if (!atomic_read(&dst->__refcnt)) {
1704                         *pprev = dst->next;
1705                         dst_free(dst);
1706                 } else {
1707                         pprev = &dst->next;
1708                         ++more;
1709                 }
1710         }
1711
1712         spin_unlock_bh(&icmp6_dst_lock);
1713
1714         return more;
1715 }
1716
1717 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1718                             void *arg)
1719 {
1720         struct dst_entry *dst, **pprev;
1721
1722         spin_lock_bh(&icmp6_dst_lock);
1723         pprev = &icmp6_dst_gc_list;
1724         while ((dst = *pprev) != NULL) {
1725                 struct rt6_info *rt = (struct rt6_info *) dst;
1726                 if (func(rt, arg)) {
1727                         *pprev = dst->next;
1728                         dst_free(dst);
1729                 } else {
1730                         pprev = &dst->next;
1731                 }
1732         }
1733         spin_unlock_bh(&icmp6_dst_lock);
1734 }
1735
1736 static int ip6_dst_gc(struct dst_ops *ops)
1737 {
1738         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1739         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1740         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1741         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1742         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1743         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1744         int entries;
1745
1746         entries = dst_entries_get_fast(ops);
1747         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1748             entries <= rt_max_size)
1749                 goto out;
1750
1751         net->ipv6.ip6_rt_gc_expire++;
1752         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1753         entries = dst_entries_get_slow(ops);
1754         if (entries < ops->gc_thresh)
1755                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1756 out:
1757         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1758         return entries > rt_max_size;
1759 }
1760
1761 static int ip6_convert_metrics(struct mx6_config *mxc,
1762                                const struct fib6_config *cfg)
1763 {
1764         bool ecn_ca = false;
1765         struct nlattr *nla;
1766         int remaining;
1767         u32 *mp;
1768
1769         if (!cfg->fc_mx)
1770                 return 0;
1771
1772         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1773         if (unlikely(!mp))
1774                 return -ENOMEM;
1775
1776         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1777                 int type = nla_type(nla);
1778                 u32 val;
1779
1780                 if (!type)
1781                         continue;
1782                 if (unlikely(type > RTAX_MAX))
1783                         goto err;
1784
1785                 if (type == RTAX_CC_ALGO) {
1786                         char tmp[TCP_CA_NAME_MAX];
1787
1788                         nla_strlcpy(tmp, nla, sizeof(tmp));
1789                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1790                         if (val == TCP_CA_UNSPEC)
1791                                 goto err;
1792                 } else {
1793                         val = nla_get_u32(nla);
1794                 }
1795                 if (type == RTAX_HOPLIMIT && val > 255)
1796                         val = 255;
1797                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1798                         goto err;
1799
1800                 mp[type - 1] = val;
1801                 __set_bit(type - 1, mxc->mx_valid);
1802         }
1803
1804         if (ecn_ca) {
1805                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1806                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1807         }
1808
1809         mxc->mx = mp;
1810         return 0;
1811  err:
1812         kfree(mp);
1813         return -EINVAL;
1814 }
1815
1816 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1817                                             struct fib6_config *cfg,
1818                                             const struct in6_addr *gw_addr)
1819 {
1820         struct flowi6 fl6 = {
1821                 .flowi6_oif = cfg->fc_ifindex,
1822                 .daddr = *gw_addr,
1823                 .saddr = cfg->fc_prefsrc,
1824         };
1825         struct fib6_table *table;
1826         struct rt6_info *rt;
1827         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1828
1829         table = fib6_get_table(net, cfg->fc_table);
1830         if (!table)
1831                 return NULL;
1832
1833         if (!ipv6_addr_any(&cfg->fc_prefsrc))
1834                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1835
1836         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1837
1838         /* if table lookup failed, fall back to full lookup */
1839         if (rt == net->ipv6.ip6_null_entry) {
1840                 ip6_rt_put(rt);
1841                 rt = NULL;
1842         }
1843
1844         return rt;
1845 }
1846
1847 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
1848 {
1849         struct net *net = cfg->fc_nlinfo.nl_net;
1850         struct rt6_info *rt = NULL;
1851         struct net_device *dev = NULL;
1852         struct inet6_dev *idev = NULL;
1853         struct fib6_table *table;
1854         int addr_type;
1855         int err = -EINVAL;
1856
1857         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1858                 goto out;
1859 #ifndef CONFIG_IPV6_SUBTREES
1860         if (cfg->fc_src_len)
1861                 goto out;
1862 #endif
1863         if (cfg->fc_ifindex) {
1864                 err = -ENODEV;
1865                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1866                 if (!dev)
1867                         goto out;
1868                 idev = in6_dev_get(dev);
1869                 if (!idev)
1870                         goto out;
1871         }
1872
1873         if (cfg->fc_metric == 0)
1874                 cfg->fc_metric = IP6_RT_PRIO_USER;
1875
1876         err = -ENOBUFS;
1877         if (cfg->fc_nlinfo.nlh &&
1878             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1879                 table = fib6_get_table(net, cfg->fc_table);
1880                 if (!table) {
1881                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1882                         table = fib6_new_table(net, cfg->fc_table);
1883                 }
1884         } else {
1885                 table = fib6_new_table(net, cfg->fc_table);
1886         }
1887
1888         if (!table)
1889                 goto out;
1890
1891         rt = ip6_dst_alloc(net, NULL,
1892                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1893
1894         if (!rt) {
1895                 err = -ENOMEM;
1896                 goto out;
1897         }
1898
1899         if (cfg->fc_flags & RTF_EXPIRES)
1900                 rt6_set_expires(rt, jiffies +
1901                                 clock_t_to_jiffies(cfg->fc_expires));
1902         else
1903                 rt6_clean_expires(rt);
1904
1905         if (cfg->fc_protocol == RTPROT_UNSPEC)
1906                 cfg->fc_protocol = RTPROT_BOOT;
1907         rt->rt6i_protocol = cfg->fc_protocol;
1908
1909         addr_type = ipv6_addr_type(&cfg->fc_dst);
1910
1911         if (addr_type & IPV6_ADDR_MULTICAST)
1912                 rt->dst.input = ip6_mc_input;
1913         else if (cfg->fc_flags & RTF_LOCAL)
1914                 rt->dst.input = ip6_input;
1915         else
1916                 rt->dst.input = ip6_forward;
1917
1918         rt->dst.output = ip6_output;
1919
1920         if (cfg->fc_encap) {
1921                 struct lwtunnel_state *lwtstate;
1922
1923                 err = lwtunnel_build_state(cfg->fc_encap_type,
1924                                            cfg->fc_encap, AF_INET6, cfg,
1925                                            &lwtstate);
1926                 if (err)
1927                         goto out;
1928                 rt->dst.lwtstate = lwtstate_get(lwtstate);
1929                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1930                         rt->dst.lwtstate->orig_output = rt->dst.output;
1931                         rt->dst.output = lwtunnel_output;
1932                 }
1933                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1934                         rt->dst.lwtstate->orig_input = rt->dst.input;
1935                         rt->dst.input = lwtunnel_input;
1936                 }
1937         }
1938
1939         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1940         rt->rt6i_dst.plen = cfg->fc_dst_len;
1941         if (rt->rt6i_dst.plen == 128)
1942                 rt->dst.flags |= DST_HOST;
1943
1944 #ifdef CONFIG_IPV6_SUBTREES
1945         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1946         rt->rt6i_src.plen = cfg->fc_src_len;
1947 #endif
1948
1949         rt->rt6i_metric = cfg->fc_metric;
1950
1951         /* We cannot add true routes via loopback here,
1952            they would result in kernel looping; promote them to reject routes
1953          */
1954         if ((cfg->fc_flags & RTF_REJECT) ||
1955             (dev && (dev->flags & IFF_LOOPBACK) &&
1956              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1957              !(cfg->fc_flags & RTF_LOCAL))) {
1958                 /* hold loopback dev/idev if we haven't done so. */
1959                 if (dev != net->loopback_dev) {
1960                         if (dev) {
1961                                 dev_put(dev);
1962                                 in6_dev_put(idev);
1963                         }
1964                         dev = net->loopback_dev;
1965                         dev_hold(dev);
1966                         idev = in6_dev_get(dev);
1967                         if (!idev) {
1968                                 err = -ENODEV;
1969                                 goto out;
1970                         }
1971                 }
1972                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1973                 switch (cfg->fc_type) {
1974                 case RTN_BLACKHOLE:
1975                         rt->dst.error = -EINVAL;
1976                         rt->dst.output = dst_discard_out;
1977                         rt->dst.input = dst_discard;
1978                         break;
1979                 case RTN_PROHIBIT:
1980                         rt->dst.error = -EACCES;
1981                         rt->dst.output = ip6_pkt_prohibit_out;
1982                         rt->dst.input = ip6_pkt_prohibit;
1983                         break;
1984                 case RTN_THROW:
1985                 case RTN_UNREACHABLE:
1986                 default:
1987                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1988                                         : (cfg->fc_type == RTN_UNREACHABLE)
1989                                         ? -EHOSTUNREACH : -ENETUNREACH;
1990                         rt->dst.output = ip6_pkt_discard_out;
1991                         rt->dst.input = ip6_pkt_discard;
1992                         break;
1993                 }
1994                 goto install_route;
1995         }
1996
1997         if (cfg->fc_flags & RTF_GATEWAY) {
1998                 const struct in6_addr *gw_addr;
1999                 int gwa_type;
2000
2001                 gw_addr = &cfg->fc_gateway;
2002                 gwa_type = ipv6_addr_type(gw_addr);
2003
2004                 /* if gw_addr is local we will fail to detect this in case
2005                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
2006                  * will return already-added prefix route via interface that
2007                  * prefix route was assigned to, which might be non-loopback.
2008                  */
2009                 err = -EINVAL;
2010                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2011                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2012                                             dev : NULL, 0, 0))
2013                         goto out;
2014
2015                 rt->rt6i_gateway = *gw_addr;
2016
2017                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2018                         struct rt6_info *grt = NULL;
2019
2020                         /* IPv6 strictly inhibits using not link-local
2021                            addresses as nexthop address.
2022                            Otherwise, router will not able to send redirects.
2023                            It is very good, but in some (rare!) circumstances
2024                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2025                            some exceptions. --ANK
2026                            We allow IPv4-mapped nexthops to support RFC4798-type
2027                            addressing
2028                          */
2029                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2030                                           IPV6_ADDR_MAPPED)))
2031                                 goto out;
2032
2033                         if (cfg->fc_table) {
2034                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2035
2036                                 if (grt) {
2037                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2038                                             (dev && dev != grt->dst.dev)) {
2039                                                 ip6_rt_put(grt);
2040                                                 grt = NULL;
2041                                         }
2042                                 }
2043                         }
2044
2045                         if (!grt)
2046                                 grt = rt6_lookup(net, gw_addr, NULL,
2047                                                  cfg->fc_ifindex, 1);
2048
2049                         err = -EHOSTUNREACH;
2050                         if (!grt)
2051                                 goto out;
2052                         if (dev) {
2053                                 if (dev != grt->dst.dev) {
2054                                         ip6_rt_put(grt);
2055                                         goto out;
2056                                 }
2057                         } else {
2058                                 dev = grt->dst.dev;
2059                                 idev = grt->rt6i_idev;
2060                                 dev_hold(dev);
2061                                 in6_dev_hold(grt->rt6i_idev);
2062                         }
2063                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2064                                 err = 0;
2065                         ip6_rt_put(grt);
2066
2067                         if (err)
2068                                 goto out;
2069                 }
2070                 err = -EINVAL;
2071                 if (!dev || (dev->flags & IFF_LOOPBACK))
2072                         goto out;
2073         }
2074
2075         err = -ENODEV;
2076         if (!dev)
2077                 goto out;
2078
2079         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2080                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2081                         err = -EINVAL;
2082                         goto out;
2083                 }
2084                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2085                 rt->rt6i_prefsrc.plen = 128;
2086         } else
2087                 rt->rt6i_prefsrc.plen = 0;
2088
2089         rt->rt6i_flags = cfg->fc_flags;
2090
2091 install_route:
2092         rt->dst.dev = dev;
2093         rt->rt6i_idev = idev;
2094         rt->rt6i_table = table;
2095
2096         cfg->fc_nlinfo.nl_net = dev_net(dev);
2097
2098         return rt;
2099 out:
2100         if (dev)
2101                 dev_put(dev);
2102         if (idev)
2103                 in6_dev_put(idev);
2104         if (rt)
2105                 dst_free(&rt->dst);
2106
2107         return ERR_PTR(err);
2108 }
2109
2110 int ip6_route_add(struct fib6_config *cfg)
2111 {
2112         struct mx6_config mxc = { .mx = NULL, };
2113         struct rt6_info *rt;
2114         int err;
2115
2116         rt = ip6_route_info_create(cfg);
2117         if (IS_ERR(rt)) {
2118                 err = PTR_ERR(rt);
2119                 rt = NULL;
2120                 goto out;
2121         }
2122
2123         err = ip6_convert_metrics(&mxc, cfg);
2124         if (err)
2125                 goto out;
2126
2127         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
2128
2129         kfree(mxc.mx);
2130
2131         return err;
2132 out:
2133         if (rt)
2134                 dst_free(&rt->dst);
2135
2136         return err;
2137 }
2138
2139 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2140 {
2141         int err;
2142         struct fib6_table *table;
2143         struct net *net = dev_net(rt->dst.dev);
2144
2145         if (rt == net->ipv6.ip6_null_entry ||
2146             rt->dst.flags & DST_NOCACHE) {
2147                 err = -ENOENT;
2148                 goto out;
2149         }
2150
2151         table = rt->rt6i_table;
2152         write_lock_bh(&table->tb6_lock);
2153         err = fib6_del(rt, info);
2154         write_unlock_bh(&table->tb6_lock);
2155
2156 out:
2157         ip6_rt_put(rt);
2158         return err;
2159 }
2160
2161 int ip6_del_rt(struct rt6_info *rt)
2162 {
2163         struct nl_info info = {
2164                 .nl_net = dev_net(rt->dst.dev),
2165         };
2166         return __ip6_del_rt(rt, &info);
2167 }
2168
2169 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2170 {
2171         struct nl_info *info = &cfg->fc_nlinfo;
2172         struct net *net = info->nl_net;
2173         struct sk_buff *skb = NULL;
2174         struct fib6_table *table;
2175         int err = -ENOENT;
2176
2177         if (rt == net->ipv6.ip6_null_entry)
2178                 goto out_put;
2179         table = rt->rt6i_table;
2180         write_lock_bh(&table->tb6_lock);
2181
2182         if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2183                 struct rt6_info *sibling, *next_sibling;
2184
2185                 /* prefer to send a single notification with all hops */
2186                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2187                 if (skb) {
2188                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2189
2190                         if (rt6_fill_node(net, skb, rt,
2191                                           NULL, NULL, 0, RTM_DELROUTE,
2192                                           info->portid, seq, 0) < 0) {
2193                                 kfree_skb(skb);
2194                                 skb = NULL;
2195                         } else
2196                                 info->skip_notify = 1;
2197                 }
2198
2199                 list_for_each_entry_safe(sibling, next_sibling,
2200                                          &rt->rt6i_siblings,
2201                                          rt6i_siblings) {
2202                         err = fib6_del(sibling, info);
2203                         if (err)
2204                                 goto out_unlock;
2205                 }
2206         }
2207
2208         err = fib6_del(rt, info);
2209 out_unlock:
2210         write_unlock_bh(&table->tb6_lock);
2211 out_put:
2212         ip6_rt_put(rt);
2213
2214         if (skb) {
2215                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2216                             info->nlh, gfp_any());
2217         }
2218         return err;
2219 }
2220
2221 static int ip6_route_del(struct fib6_config *cfg)
2222 {
2223         struct fib6_table *table;
2224         struct fib6_node *fn;
2225         struct rt6_info *rt;
2226         int err = -ESRCH;
2227
2228         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2229         if (!table)
2230                 return err;
2231
2232         read_lock_bh(&table->tb6_lock);
2233
2234         fn = fib6_locate(&table->tb6_root,
2235                          &cfg->fc_dst, cfg->fc_dst_len,
2236                          &cfg->fc_src, cfg->fc_src_len);
2237
2238         if (fn) {
2239                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2240                         if ((rt->rt6i_flags & RTF_CACHE) &&
2241                             !(cfg->fc_flags & RTF_CACHE))
2242                                 continue;
2243                         if (cfg->fc_ifindex &&
2244                             (!rt->dst.dev ||
2245                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2246                                 continue;
2247                         if (cfg->fc_flags & RTF_GATEWAY &&
2248                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2249                                 continue;
2250                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2251                                 continue;
2252                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2253                                 continue;
2254                         dst_hold(&rt->dst);
2255                         read_unlock_bh(&table->tb6_lock);
2256
2257                         /* if gateway was specified only delete the one hop */
2258                         if (cfg->fc_flags & RTF_GATEWAY)
2259                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2260
2261                         return __ip6_del_rt_siblings(rt, cfg);
2262                 }
2263         }
2264         read_unlock_bh(&table->tb6_lock);
2265
2266         return err;
2267 }
2268
/*
 * Process a received ICMPv6 Redirect carried in @skb for the route @dst.
 *
 * The message is dropped (with a rate-limited debug message in most cases)
 * when it is too short, its destination is multicast, its target is neither
 * equal to the destination nor a link-local unicast address, the receiving
 * interface has no inet6_dev, forwards packets or has accept_redirects
 * disabled, its ND options do not parse, or @dst is a reject route.
 *
 * Otherwise the neighbour entry for the target is updated, a new
 * RTF_DYNAMIC|RTF_CACHE route for the destination is allocated with
 * ip6_rt_cache_alloc() and inserted, NETEVENT_REDIRECT notifiers are
 * called, and @dst is deleted if it was itself a cache route.
 *
 * @sk is not referenced in this function.
 */
2269 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2270 {
2271         struct netevent_redirect netevent;
2272         struct rt6_info *rt, *nrt = NULL;
2273         struct ndisc_options ndopts;
2274         struct inet6_dev *in6_dev;
2275         struct neighbour *neigh;
2276         struct rd_msg *msg;
2277         int optlen, on_link;
2278         u8 *lladdr;
2279
             /* bytes left after the fixed rd_msg header, i.e. the options area */
2280         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2281         optlen -= sizeof(*msg);
2282
2283         if (optlen < 0) {
2284                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2285                 return;
2286         }
2287
2288         msg = (struct rd_msg *)icmp6_hdr(skb);
2289
2290         if (ipv6_addr_is_multicast(&msg->dest)) {
2291                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2292                 return;
2293         }
2294
             /* target == destination: treated as on-link, so RTF_GATEWAY is
              * cleared on the new route further down; any other target must
              * be a link-local unicast address
              */
2295         on_link = 0;
2296         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2297                 on_link = 1;
2298         } else if (ipv6_addr_type(&msg->target) !=
2299                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2300                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2301                 return;
2302         }
2303
2304         in6_dev = __in6_dev_get(skb->dev);
2305         if (!in6_dev)
2306                 return;
             /* silently ignore redirects on forwarding interfaces or when
              * accept_redirects is off
              */
2307         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2308                 return;
2309
2310         /* RFC2461 8.1:
2311          *      The IP source address of the Redirect MUST be the same as the current
2312          *      first-hop router for the specified ICMP Destination Address.
2313          */
2314
2315         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2316                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2317                 return;
2318         }
2319
             /* the target link-layer address option is optional; lladdr
              * stays NULL when it is absent
              */
2320         lladdr = NULL;
2321         if (ndopts.nd_opts_tgt_lladdr) {
2322                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2323                                              skb->dev);
2324                 if (!lladdr) {
2325                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2326                         return;
2327                 }
2328         }
2329
2330         rt = (struct rt6_info *) dst;
2331         if (rt->rt6i_flags & RTF_REJECT) {
2332                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2333                 return;
2334         }
2335
2336         /* Redirect received -> path was valid.
2337          * Look, redirects are sent only in response to data packets,
2338          * so that this nexthop apparently is reachable. --ANK
2339          */
2340         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2341
             /* NOTE(review): the final argument 1 presumably asks for the
              * entry to be created when missing - confirm against
              * __neigh_lookup(). The reference is dropped at "out".
              */
2342         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2343         if (!neigh)
2344                 return;
2345
2346         /*
2347          *      We have finally decided to accept it.
2348          */
2349
             /* the router flags are only passed when the target is not on-link */
2350         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2351                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2352                      NEIGH_UPDATE_F_OVERRIDE|
2353                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2354                                      NEIGH_UPDATE_F_ISROUTER)),
2355                      NDISC_REDIRECT, &ndopts);
2356
2357         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2358         if (!nrt)
2359                 goto out;
2360
2361         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2362         if (on_link)
2363                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2364
             /* the new next hop is the neighbour's key, i.e. the redirect target */
2365         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2366
2367         if (ip6_ins_rt(nrt))
2368                 goto out;
2369
2370         netevent.old = &rt->dst;
2371         netevent.new = &nrt->dst;
2372         netevent.daddr = &msg->dest;
2373         netevent.neigh = neigh;
2374         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2375
             /* an old cache route is deleted; ip6_del_rt() drops one
              * reference, so take an extra one for it first
              */
2376         if (rt->rt6i_flags & RTF_CACHE) {
2377                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2378                 ip6_del_rt(rt);
2379         }
2380
2381 out:
2382         neigh_release(neigh);
2383 }
2384
2385 /*
2386  *      Misc support functions
2387  */
2388
2389 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2390 {
2391         BUG_ON(from->dst.from);
2392
2393         rt->rt6i_flags &= ~RTF_EXPIRES;
2394         dst_hold(&from->dst);
2395         rt->dst.from = &from->dst;
2396         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2397 }
2398
2399 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2400 {
2401         rt->dst.input = ort->dst.input;
2402         rt->dst.output = ort->dst.output;
2403         rt->rt6i_dst = ort->rt6i_dst;
2404         rt->dst.error = ort->dst.error;
2405         rt->rt6i_idev = ort->rt6i_idev;
2406         if (rt->rt6i_idev)
2407                 in6_dev_hold(rt->rt6i_idev);
2408         rt->dst.lastuse = jiffies;
2409         rt->rt6i_gateway = ort->rt6i_gateway;
2410         rt->rt6i_flags = ort->rt6i_flags;
2411         rt6_set_from(rt, ort);
2412         rt->rt6i_metric = ort->rt6i_metric;
2413 #ifdef CONFIG_IPV6_SUBTREES
2414         rt->rt6i_src = ort->rt6i_src;
2415 #endif
2416         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2417         rt->rt6i_table = ort->rt6i_table;
2418         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2419 }
2420
2421 #ifdef CONFIG_IPV6_ROUTE_INFO
2422 static struct rt6_info *rt6_get_route_info(struct net *net,
2423                                            const struct in6_addr *prefix, int prefixlen,
2424                                            const struct in6_addr *gwaddr,
2425                                            struct net_device *dev)
2426 {
2427         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2428         int ifindex = dev->ifindex;
2429         struct fib6_node *fn;
2430         struct rt6_info *rt = NULL;
2431         struct fib6_table *table;
2432
2433         table = fib6_get_table(net, tb_id);
2434         if (!table)
2435                 return NULL;
2436
2437         read_lock_bh(&table->tb6_lock);
2438         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2439         if (!fn)
2440                 goto out;
2441
2442         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2443                 if (rt->dst.dev->ifindex != ifindex)
2444                         continue;
2445                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2446                         continue;
2447                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2448                         continue;
2449                 dst_hold(&rt->dst);
2450                 break;
2451         }
2452 out:
2453         read_unlock_bh(&table->tb6_lock);
2454         return rt;
2455 }
2456
2457 static struct rt6_info *rt6_add_route_info(struct net *net,
2458                                            const struct in6_addr *prefix, int prefixlen,
2459                                            const struct in6_addr *gwaddr,
2460                                            struct net_device *dev,
2461                                            unsigned int pref)
2462 {
2463         struct fib6_config cfg = {
2464                 .fc_metric      = IP6_RT_PRIO_USER,
2465                 .fc_ifindex     = dev->ifindex,
2466                 .fc_dst_len     = prefixlen,
2467                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2468                                   RTF_UP | RTF_PREF(pref),
2469                 .fc_nlinfo.portid = 0,
2470                 .fc_nlinfo.nlh = NULL,
2471                 .fc_nlinfo.nl_net = net,
2472         };
2473
2474         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
2475         cfg.fc_dst = *prefix;
2476         cfg.fc_gateway = *gwaddr;
2477
2478         /* We should treat it as a default route if prefix length is 0. */
2479         if (!prefixlen)
2480                 cfg.fc_flags |= RTF_DEFAULT;
2481
2482         ip6_route_add(&cfg);
2483
2484         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2485 }
2486 #endif
2487
2488 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2489 {
2490         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2491         struct rt6_info *rt;
2492         struct fib6_table *table;
2493
2494         table = fib6_get_table(dev_net(dev), tb_id);
2495         if (!table)
2496                 return NULL;
2497
2498         read_lock_bh(&table->tb6_lock);
2499         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2500                 if (dev == rt->dst.dev &&
2501                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2502                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2503                         break;
2504         }
2505         if (rt)
2506                 dst_hold(&rt->dst);
2507         read_unlock_bh(&table->tb6_lock);
2508         return rt;
2509 }
2510
2511 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2512                                      struct net_device *dev,
2513                                      unsigned int pref)
2514 {
2515         struct fib6_config cfg = {
2516                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2517                 .fc_metric      = IP6_RT_PRIO_USER,
2518                 .fc_ifindex     = dev->ifindex,
2519                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2520                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2521                 .fc_nlinfo.portid = 0,
2522                 .fc_nlinfo.nlh = NULL,
2523                 .fc_nlinfo.nl_net = dev_net(dev),
2524         };
2525
2526         cfg.fc_gateway = *gwaddr;
2527
2528         if (!ip6_route_add(&cfg)) {
2529                 struct fib6_table *table;
2530
2531                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
2532                 if (table)
2533                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2534         }
2535
2536         return rt6_get_dflt_router(gwaddr, dev);
2537 }
2538
2539 static void __rt6_purge_dflt_routers(struct fib6_table *table)
2540 {
2541         struct rt6_info *rt;
2542
2543 restart:
2544         read_lock_bh(&table->tb6_lock);
2545         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2546                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2547                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2548                         dst_hold(&rt->dst);
2549                         read_unlock_bh(&table->tb6_lock);
2550                         ip6_del_rt(rt);
2551                         goto restart;
2552                 }
2553         }
2554         read_unlock_bh(&table->tb6_lock);
2555
2556         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
2557 }
2558
2559 void rt6_purge_dflt_routers(struct net *net)
2560 {
2561         struct fib6_table *table;
2562         struct hlist_head *head;
2563         unsigned int h;
2564
2565         rcu_read_lock();
2566
2567         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2568                 head = &net->ipv6.fib_table_hash[h];
2569                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2570                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2571                                 __rt6_purge_dflt_routers(table);
2572                 }
2573         }
2574
2575         rcu_read_unlock();
2576 }
2577
2578 static void rtmsg_to_fib6_config(struct net *net,
2579                                  struct in6_rtmsg *rtmsg,
2580                                  struct fib6_config *cfg)
2581 {
2582         memset(cfg, 0, sizeof(*cfg));
2583
2584         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2585                          : RT6_TABLE_MAIN;
2586         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2587         cfg->fc_metric = rtmsg->rtmsg_metric;
2588         cfg->fc_expires = rtmsg->rtmsg_info;
2589         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2590         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2591         cfg->fc_flags = rtmsg->rtmsg_flags;
2592
2593         cfg->fc_nlinfo.nl_net = net;
2594
2595         cfg->fc_dst = rtmsg->rtmsg_dst;
2596         cfg->fc_src = rtmsg->rtmsg_src;
2597         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2598 }
2599
2600 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2601 {
2602         struct fib6_config cfg;
2603         struct in6_rtmsg rtmsg;
2604         int err;
2605
2606         switch (cmd) {
2607         case SIOCADDRT:         /* Add a route */
2608         case SIOCDELRT:         /* Delete a route */
2609                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2610                         return -EPERM;
2611                 err = copy_from_user(&rtmsg, arg,
2612                                      sizeof(struct in6_rtmsg));
2613                 if (err)
2614                         return -EFAULT;
2615
2616                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2617
2618                 rtnl_lock();
2619                 switch (cmd) {
2620                 case SIOCADDRT:
2621                         err = ip6_route_add(&cfg);
2622                         break;
2623                 case SIOCDELRT:
2624                         err = ip6_route_del(&cfg);
2625                         break;
2626                 default:
2627                         err = -EINVAL;
2628                 }
2629                 rtnl_unlock();
2630
2631                 return err;
2632         }
2633
2634         return -EINVAL;
2635 }
2636
2637 /*
2638  *      Drop the packet on the floor
2639  */
2640
2641 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2642 {
2643         int type;
2644         struct dst_entry *dst = skb_dst(skb);
2645         switch (ipstats_mib_noroutes) {
2646         case IPSTATS_MIB_INNOROUTES:
2647                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2648                 if (type == IPV6_ADDR_ANY) {
2649                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2650                                       IPSTATS_MIB_INADDRERRORS);
2651                         break;
2652                 }
2653                 /* FALLTHROUGH */
2654         case IPSTATS_MIB_OUTNOROUTES:
2655                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2656                               ipstats_mib_noroutes);
2657                 break;
2658         }
2659         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2660         kfree_skb(skb);
2661         return 0;
2662 }
2663
2664 static int ip6_pkt_discard(struct sk_buff *skb)
2665 {
2666         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2667 }
2668
2669 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2670 {
2671         skb->dev = skb_dst(skb)->dev;
2672         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2673 }
2674
2675 static int ip6_pkt_prohibit(struct sk_buff *skb)
2676 {
2677         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2678 }
2679
2680 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2681 {
2682         skb->dev = skb_dst(skb)->dev;
2683         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2684 }
2685
2686 /*
2687  *      Allocate a dst for local (unicast / anycast) address.
2688  */
2689
2690 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2691                                     const struct in6_addr *addr,
2692                                     bool anycast)
2693 {
2694         u32 tb_id;
2695         struct net *net = dev_net(idev->dev);
2696         struct net_device *dev = net->loopback_dev;
2697         struct rt6_info *rt;
2698
2699         /* use L3 Master device as loopback for host routes if device
2700          * is enslaved and address is not link local or multicast
2701          */
2702         if (!rt6_need_strict(addr))
2703                 dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2704
2705         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2706         if (!rt)
2707                 return ERR_PTR(-ENOMEM);
2708
2709         in6_dev_hold(idev);
2710
2711         rt->dst.flags |= DST_HOST;
2712         rt->dst.input = ip6_input;
2713         rt->dst.output = ip6_output;
2714         rt->rt6i_idev = idev;
2715
2716         rt->rt6i_protocol = RTPROT_KERNEL;
2717         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2718         if (anycast)
2719                 rt->rt6i_flags |= RTF_ANYCAST;
2720         else
2721                 rt->rt6i_flags |= RTF_LOCAL;
2722
2723         rt->rt6i_gateway  = *addr;
2724         rt->rt6i_dst.addr = *addr;
2725         rt->rt6i_dst.plen = 128;
2726         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2727         rt->rt6i_table = fib6_get_table(net, tb_id);
2728         rt->dst.flags |= DST_NOCACHE;
2729
2730         atomic_set(&rt->dst.__refcnt, 1);
2731
2732         return rt;
2733 }
2734
/* Walker argument for fib6_remove_prefsrc(): remove a deleted address
 * from the prefsrc entries of matching routes.
 */
struct arg_dev_net_ip {
	struct net_device *dev;	/* match routes on this device; NULL matches any */
	struct net *net;	/* namespace whose ip6_null_entry is skipped */
	struct in6_addr *addr;	/* address compared with rt6i_prefsrc.addr */
};
2741
2742 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2743 {
2744         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2745         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2746         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2747
2748         if (((void *)rt->dst.dev == dev || !dev) &&
2749             rt != net->ipv6.ip6_null_entry &&
2750             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2751                 /* remove prefsrc entry */
2752                 rt->rt6i_prefsrc.plen = 0;
2753         }
2754         return 0;
2755 }
2756
2757 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2758 {
2759         struct net *net = dev_net(ifp->idev->dev);
2760         struct arg_dev_net_ip adni = {
2761                 .dev = ifp->idev->dev,
2762                 .net = net,
2763                 .addr = &ifp->addr,
2764         };
2765         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2766 }
2767
2768 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2769 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2770
2771 /* Remove routers and update dst entries when gateway turn into host. */
2772 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2773 {
2774         struct in6_addr *gateway = (struct in6_addr *)arg;
2775
2776         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2777              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2778              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2779                 return -1;
2780         }
2781         return 0;
2782 }
2783
2784 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2785 {
2786         fib6_clean_all(net, fib6_clean_tohost, gateway);
2787 }
2788
/* Walker argument for fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* match routes on this device; NULL matches any */
	struct net *net;	/* namespace whose ip6_null_entry is skipped */
};
2793
2794 /* called with write lock held for table with rt */
2795 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2796 {
2797         const struct arg_dev_net *adn = arg;
2798         const struct net_device *dev = adn->dev;
2799
2800         if ((rt->dst.dev == dev || !dev) &&
2801             rt != adn->net->ipv6.ip6_null_entry &&
2802             (rt->rt6i_nsiblings == 0 ||
2803              !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
2804                 return -1;
2805
2806         return 0;
2807 }
2808
2809 void rt6_ifdown(struct net *net, struct net_device *dev)
2810 {
2811         struct arg_dev_net adn = {
2812                 .dev = dev,
2813                 .net = net,
2814         };
2815
2816         fib6_clean_all(net, fib6_ifdown, &adn);
2817         icmp6_clean_all(fib6_ifdown, &adn);
2818         if (dev)
2819                 rt6_uncached_list_flush_dev(net, dev);
2820 }
2821
/* Walker argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
2826
2827 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2828 {
2829         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2830         struct inet6_dev *idev;
2831
2832         /* In IPv6 pmtu discovery is not optional,
2833            so that RTAX_MTU lock cannot disable it.
2834            We still use this lock to block changes
2835            caused by addrconf/ndisc.
2836         */
2837
2838         idev = __in6_dev_get(arg->dev);
2839         if (!idev)
2840                 return 0;
2841
2842         /* For administrative MTU increase, there is no way to discover
2843            IPv6 PMTU increase, so PMTU increase should be updated here.
2844            Since RFC 1981 doesn't include administrative MTU increase
2845            update PMTU increase is a MUST. (i.e. jumbo frame)
2846          */
2847         /*
2848            If new MTU is less than route PMTU, this new MTU will be the
2849            lowest MTU in the path, update the route PMTU to reflect PMTU
2850            decreases; if new MTU is greater than route PMTU, and the
2851            old MTU is the lowest MTU in the path, update the route PMTU
2852            to reflect the increase. In this case if the other nodes' MTU
2853            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2854            PMTU discovery.
2855          */
2856         if (rt->dst.dev == arg->dev &&
2857             dst_metric_raw(&rt->dst, RTAX_MTU) &&
2858             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2859                 if (rt->rt6i_flags & RTF_CACHE) {
2860                         /* For RTF_CACHE with rt6i_pmtu == 0
2861                          * (i.e. a redirected route),
2862                          * the metrics of its rt->dst.from has already
2863                          * been updated.
2864                          */
2865                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2866                                 rt->rt6i_pmtu = arg->mtu;
2867                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2868                            (dst_mtu(&rt->dst) < arg->mtu &&
2869                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2870                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2871                 }
2872         }
2873         return 0;
2874 }
2875
2876 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2877 {
2878         struct rt6_mtu_change_arg arg = {
2879                 .dev = dev,
2880                 .mtu = mtu,
2881         };
2882
2883         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2884 }
2885
2886 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2887         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2888         [RTA_OIF]               = { .type = NLA_U32 },
2889         [RTA_IIF]               = { .type = NLA_U32 },
2890         [RTA_PRIORITY]          = { .type = NLA_U32 },
2891         [RTA_METRICS]           = { .type = NLA_NESTED },
2892         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2893         [RTA_PREF]              = { .type = NLA_U8 },
2894         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2895         [RTA_ENCAP]             = { .type = NLA_NESTED },
2896         [RTA_EXPIRES]           = { .type = NLA_U32 },
2897         [RTA_UID]               = { .type = NLA_U32 },
2898         [RTA_MARK]              = { .type = NLA_U32 },
2899 };
2900
2901 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2902                               struct fib6_config *cfg)
2903 {
2904         struct rtmsg *rtm;
2905         struct nlattr *tb[RTA_MAX+1];
2906         unsigned int pref;
2907         int err;
2908
2909         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
2910                           NULL);
2911         if (err < 0)
2912                 goto errout;
2913
2914         err = -EINVAL;
2915         rtm = nlmsg_data(nlh);
2916         memset(cfg, 0, sizeof(*cfg));
2917
2918         cfg->fc_table = rtm->rtm_table;
2919         cfg->fc_dst_len = rtm->rtm_dst_len;
2920         cfg->fc_src_len = rtm->rtm_src_len;
2921         cfg->fc_flags = RTF_UP;
2922         cfg->fc_protocol = rtm->rtm_protocol;
2923         cfg->fc_type = rtm->rtm_type;
2924
2925         if (rtm->rtm_type == RTN_UNREACHABLE ||
2926             rtm->rtm_type == RTN_BLACKHOLE ||
2927             rtm->rtm_type == RTN_PROHIBIT ||
2928             rtm->rtm_type == RTN_THROW)
2929                 cfg->fc_flags |= RTF_REJECT;
2930
2931         if (rtm->rtm_type == RTN_LOCAL)
2932                 cfg->fc_flags |= RTF_LOCAL;
2933
2934         if (rtm->rtm_flags & RTM_F_CLONED)
2935                 cfg->fc_flags |= RTF_CACHE;
2936
2937         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2938         cfg->fc_nlinfo.nlh = nlh;
2939         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2940
2941         if (tb[RTA_GATEWAY]) {
2942                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2943                 cfg->fc_flags |= RTF_GATEWAY;
2944         }
2945
2946         if (tb[RTA_DST]) {
2947                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2948
2949                 if (nla_len(tb[RTA_DST]) < plen)
2950                         goto errout;
2951
2952                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2953         }
2954
2955         if (tb[RTA_SRC]) {
2956                 int plen = (rtm->rtm_src_len + 7) >> 3;
2957
2958                 if (nla_len(tb[RTA_SRC]) < plen)
2959                         goto errout;
2960
2961                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2962         }
2963
2964         if (tb[RTA_PREFSRC])
2965                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2966
2967         if (tb[RTA_OIF])
2968                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2969
2970         if (tb[RTA_PRIORITY])
2971                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2972
2973         if (tb[RTA_METRICS]) {
2974                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2975                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2976         }
2977
2978         if (tb[RTA_TABLE])
2979                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2980
2981         if (tb[RTA_MULTIPATH]) {
2982                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2983                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2984
2985                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
2986                                                      cfg->fc_mp_len);
2987                 if (err < 0)
2988                         goto errout;
2989         }
2990
2991         if (tb[RTA_PREF]) {
2992                 pref = nla_get_u8(tb[RTA_PREF]);
2993                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2994                     pref != ICMPV6_ROUTER_PREF_HIGH)
2995                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2996                 cfg->fc_flags |= RTF_PREF(pref);
2997         }
2998
2999         if (tb[RTA_ENCAP])
3000                 cfg->fc_encap = tb[RTA_ENCAP];
3001
3002         if (tb[RTA_ENCAP_TYPE]) {
3003                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3004
3005                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
3006                 if (err < 0)
3007                         goto errout;
3008         }
3009
3010         if (tb[RTA_EXPIRES]) {
3011                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3012
3013                 if (addrconf_finite_timeout(timeout)) {
3014                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3015                         cfg->fc_flags |= RTF_EXPIRES;
3016                 }
3017         }
3018
3019         err = 0;
3020 errout:
3021         return err;
3022 }
3023
3024 struct rt6_nh {
3025         struct rt6_info *rt6_info;
3026         struct fib6_config r_cfg;
3027         struct mx6_config mxc;
3028         struct list_head next;
3029 };
3030
3031 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3032 {
3033         struct rt6_nh *nh;
3034
3035         list_for_each_entry(nh, rt6_nh_list, next) {
3036                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3037                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3038                         nh->r_cfg.fc_ifindex);
3039         }
3040 }
3041
3042 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3043                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3044 {
3045         struct rt6_nh *nh;
3046         struct rt6_info *rtnh;
3047         int err = -EEXIST;
3048
3049         list_for_each_entry(nh, rt6_nh_list, next) {
3050                 /* check if rt6_info already exists */
3051                 rtnh = nh->rt6_info;
3052
3053                 if (rtnh->dst.dev == rt->dst.dev &&
3054                     rtnh->rt6i_idev == rt->rt6i_idev &&
3055                     ipv6_addr_equal(&rtnh->rt6i_gateway,
3056                                     &rt->rt6i_gateway))
3057                         return err;
3058         }
3059
3060         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3061         if (!nh)
3062                 return -ENOMEM;
3063         nh->rt6_info = rt;
3064         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3065         if (err) {
3066                 kfree(nh);
3067                 return err;
3068         }
3069         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3070         list_add_tail(&nh->next, rt6_nh_list);
3071
3072         return 0;
3073 }
3074
3075 static void ip6_route_mpath_notify(struct rt6_info *rt,
3076                                    struct rt6_info *rt_last,
3077                                    struct nl_info *info,
3078                                    __u16 nlflags)
3079 {
3080         /* if this is an APPEND route, then rt points to the first route
3081          * inserted and rt_last points to last route inserted. Userspace
3082          * wants a consistent dump of the route which starts at the first
3083          * nexthop. Since sibling routes are always added at the end of
3084          * the list, find the first sibling of the last route appended
3085          */
3086         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3087                 rt = list_first_entry(&rt_last->rt6i_siblings,
3088                                       struct rt6_info,
3089                                       rt6i_siblings);
3090         }
3091
3092         if (rt)
3093                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3094 }
3095
3096 static int ip6_route_multipath_add(struct fib6_config *cfg)
3097 {
3098         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3099         struct nl_info *info = &cfg->fc_nlinfo;
3100         struct fib6_config r_cfg;
3101         struct rtnexthop *rtnh;
3102         struct rt6_info *rt;
3103         struct rt6_nh *err_nh;
3104         struct rt6_nh *nh, *nh_safe;
3105         __u16 nlflags;
3106         int remaining;
3107         int attrlen;
3108         int err = 1;
3109         int nhn = 0;
3110         int replace = (cfg->fc_nlinfo.nlh &&
3111                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3112         LIST_HEAD(rt6_nh_list);
3113
3114         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3115         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3116                 nlflags |= NLM_F_APPEND;
3117
3118         remaining = cfg->fc_mp_len;
3119         rtnh = (struct rtnexthop *)cfg->fc_mp;
3120
3121         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3122          * rt6_info structs per nexthop
3123          */
3124         while (rtnh_ok(rtnh, remaining)) {
3125                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3126                 if (rtnh->rtnh_ifindex)
3127                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3128
3129                 attrlen = rtnh_attrlen(rtnh);
3130                 if (attrlen > 0) {
3131                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3132
3133                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3134                         if (nla) {
3135                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3136                                 r_cfg.fc_flags |= RTF_GATEWAY;
3137                         }
3138                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3139                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3140                         if (nla)
3141                                 r_cfg.fc_encap_type = nla_get_u16(nla);
3142                 }
3143
3144                 rt = ip6_route_info_create(&r_cfg);
3145                 if (IS_ERR(rt)) {
3146                         err = PTR_ERR(rt);
3147                         rt = NULL;
3148                         goto cleanup;
3149                 }
3150
3151                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3152                 if (err) {
3153                         dst_free(&rt->dst);
3154                         goto cleanup;
3155                 }
3156
3157                 rtnh = rtnh_next(rtnh, &remaining);
3158         }
3159
3160         /* for add and replace send one notification with all nexthops.
3161          * Skip the notification in fib6_add_rt2node and send one with
3162          * the full route when done
3163          */
3164         info->skip_notify = 1;
3165
3166         err_nh = NULL;
3167         list_for_each_entry(nh, &rt6_nh_list, next) {
3168                 rt_last = nh->rt6_info;
3169                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc);
3170                 /* save reference to first route for notification */
3171                 if (!rt_notif && !err)
3172                         rt_notif = nh->rt6_info;
3173
3174                 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3175                 nh->rt6_info = NULL;
3176                 if (err) {
3177                         if (replace && nhn)
3178                                 ip6_print_replace_route_err(&rt6_nh_list);
3179                         err_nh = nh;
3180                         goto add_errout;
3181                 }
3182
3183                 /* Because each route is added like a single route we remove
3184                  * these flags after the first nexthop: if there is a collision,
3185                  * we have already failed to add the first nexthop:
3186                  * fib6_add_rt2node() has rejected it; when replacing, old
3187                  * nexthops have been replaced by first new, the rest should
3188                  * be added to it.
3189                  */
3190                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3191                                                      NLM_F_REPLACE);
3192                 nhn++;
3193         }
3194
3195         /* success ... tell user about new route */
3196         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3197         goto cleanup;
3198
3199 add_errout:
3200         /* send notification for routes that were added so that
3201          * the delete notifications sent by ip6_route_del are
3202          * coherent
3203          */
3204         if (rt_notif)
3205                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3206
3207         /* Delete routes that were already added */
3208         list_for_each_entry(nh, &rt6_nh_list, next) {
3209                 if (err_nh == nh)
3210                         break;
3211                 ip6_route_del(&nh->r_cfg);
3212         }
3213
3214 cleanup:
3215         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3216                 if (nh->rt6_info)
3217                         dst_free(&nh->rt6_info->dst);
3218                 kfree(nh->mxc.mx);
3219                 list_del(&nh->next);
3220                 kfree(nh);
3221         }
3222
3223         return err;
3224 }
3225
3226 static int ip6_route_multipath_del(struct fib6_config *cfg)
3227 {
3228         struct fib6_config r_cfg;
3229         struct rtnexthop *rtnh;
3230         int remaining;
3231         int attrlen;
3232         int err = 1, last_err = 0;
3233
3234         remaining = cfg->fc_mp_len;
3235         rtnh = (struct rtnexthop *)cfg->fc_mp;
3236
3237         /* Parse a Multipath Entry */
3238         while (rtnh_ok(rtnh, remaining)) {
3239                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3240                 if (rtnh->rtnh_ifindex)
3241                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3242
3243                 attrlen = rtnh_attrlen(rtnh);
3244                 if (attrlen > 0) {
3245                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3246
3247                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3248                         if (nla) {
3249                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3250                                 r_cfg.fc_flags |= RTF_GATEWAY;
3251                         }
3252                 }
3253                 err = ip6_route_del(&r_cfg);
3254                 if (err)
3255                         last_err = err;
3256
3257                 rtnh = rtnh_next(rtnh, &remaining);
3258         }
3259
3260         return last_err;
3261 }
3262
3263 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3264                               struct netlink_ext_ack *extack)
3265 {
3266         struct fib6_config cfg;
3267         int err;
3268
3269         err = rtm_to_fib6_config(skb, nlh, &cfg);
3270         if (err < 0)
3271                 return err;
3272
3273         if (cfg.fc_mp)
3274                 return ip6_route_multipath_del(&cfg);
3275         else {
3276                 cfg.fc_delete_all_nh = 1;
3277                 return ip6_route_del(&cfg);
3278         }
3279 }
3280
3281 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3282                               struct netlink_ext_ack *extack)
3283 {
3284         struct fib6_config cfg;
3285         int err;
3286
3287         err = rtm_to_fib6_config(skb, nlh, &cfg);
3288         if (err < 0)
3289                 return err;
3290
3291         if (cfg.fc_mp)
3292                 return ip6_route_multipath_add(&cfg);
3293         else
3294                 return ip6_route_add(&cfg);
3295 }
3296
3297 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3298 {
3299         int nexthop_len = 0;
3300
3301         if (rt->rt6i_nsiblings) {
3302                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
3303                             + NLA_ALIGN(sizeof(struct rtnexthop))
3304                             + nla_total_size(16) /* RTA_GATEWAY */
3305                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
3306
3307                 nexthop_len *= rt->rt6i_nsiblings;
3308         }
3309
3310         return NLMSG_ALIGN(sizeof(struct rtmsg))
3311                + nla_total_size(16) /* RTA_SRC */
3312                + nla_total_size(16) /* RTA_DST */
3313                + nla_total_size(16) /* RTA_GATEWAY */
3314                + nla_total_size(16) /* RTA_PREFSRC */
3315                + nla_total_size(4) /* RTA_TABLE */
3316                + nla_total_size(4) /* RTA_IIF */
3317                + nla_total_size(4) /* RTA_OIF */
3318                + nla_total_size(4) /* RTA_PRIORITY */
3319                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3320                + nla_total_size(sizeof(struct rta_cacheinfo))
3321                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3322                + nla_total_size(1) /* RTA_PREF */
3323                + lwtunnel_get_encap_size(rt->dst.lwtstate)
3324                + nexthop_len;
3325 }
3326
3327 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3328                             unsigned int *flags, bool skip_oif)
3329 {
3330         if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3331                 *flags |= RTNH_F_LINKDOWN;
3332                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3333                         *flags |= RTNH_F_DEAD;
3334         }
3335
3336         if (rt->rt6i_flags & RTF_GATEWAY) {
3337                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3338                         goto nla_put_failure;
3339         }
3340
3341         /* not needed for multipath encoding b/c it has a rtnexthop struct */
3342         if (!skip_oif && rt->dst.dev &&
3343             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3344                 goto nla_put_failure;
3345
3346         if (rt->dst.lwtstate &&
3347             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3348                 goto nla_put_failure;
3349
3350         return 0;
3351
3352 nla_put_failure:
3353         return -EMSGSIZE;
3354 }
3355
3356 /* add multipath next hop */
3357 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3358 {
3359         struct rtnexthop *rtnh;
3360         unsigned int flags = 0;
3361
3362         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3363         if (!rtnh)
3364                 goto nla_put_failure;
3365
3366         rtnh->rtnh_hops = 0;
3367         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3368
3369         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3370                 goto nla_put_failure;
3371
3372         rtnh->rtnh_flags = flags;
3373
3374         /* length of rtnetlink header + attributes */
3375         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
3376
3377         return 0;
3378
3379 nla_put_failure:
3380         return -EMSGSIZE;
3381 }
3382
/* Build one rtnetlink route message describing @rt and append it to @skb.
 *
 * @net:    namespace handed to ip6mr_get_route() and ip6_route_get_saddr()
 * @skb:    message buffer being filled
 * @rt:     route to describe
 * @dst:    if non-NULL, emitted as RTA_DST and rtm_dst_len is forced to 128;
 *          otherwise the route's own destination prefix is used
 * @src:    same as @dst, for RTA_SRC (only with CONFIG_IPV6_SUBTREES)
 * @iif:    input interface index, 0 if none
 * @type:   netlink message type passed to nlmsg_put()
 * @portid: netlink port id passed to nlmsg_put()
 * @seq:    netlink sequence number passed to nlmsg_put()
 * @flags:  netlink message flags passed to nlmsg_put()
 *
 * Returns 0 on success.  Every failure path returns -EMSGSIZE, and if the
 * header had already been added the partial message is cancelled first.
 */
static int rt6_fill_node(struct net *net,
                         struct sk_buff *skb, struct rt6_info *rt,
                         struct in6_addr *dst, struct in6_addr *src,
                         int iif, int type, u32 portid, u32 seq,
                         unsigned int flags)
{
        u32 metrics[RTAX_MAX];
        struct rtmsg *rtm;
        struct nlmsghdr *nlh;
        long expires;
        u32 table;

        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
        if (!nlh)
                return -EMSGSIZE;

        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET6;
        rtm->rtm_dst_len = rt->rt6i_dst.plen;
        rtm->rtm_src_len = rt->rt6i_src.plen;
        rtm->rtm_tos = 0;
        if (rt->rt6i_table)
                table = rt->rt6i_table->tb6_id;
        else
                table = RT6_TABLE_UNSPEC;
        /* the table id goes both into the header field and into RTA_TABLE */
        rtm->rtm_table = table;
        if (nla_put_u32(skb, RTA_TABLE, table))
                goto nla_put_failure;
        /* derive the route type: reject routes are classified by dst.error,
         * everything else by route flags and the loopback device flag
         */
        if (rt->rt6i_flags & RTF_REJECT) {
                switch (rt->dst.error) {
                case -EINVAL:
                        rtm->rtm_type = RTN_BLACKHOLE;
                        break;
                case -EACCES:
                        rtm->rtm_type = RTN_PROHIBIT;
                        break;
                case -EAGAIN:
                        rtm->rtm_type = RTN_THROW;
                        break;
                default:
                        rtm->rtm_type = RTN_UNREACHABLE;
                        break;
                }
        }
        else if (rt->rt6i_flags & RTF_LOCAL)
                rtm->rtm_type = RTN_LOCAL;
        else if (rt->rt6i_flags & RTF_ANYCAST)
                rtm->rtm_type = RTN_ANYCAST;
        else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
                rtm->rtm_type = RTN_LOCAL;
        else
                rtm->rtm_type = RTN_UNICAST;
        rtm->rtm_flags = 0;
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        /* start from the stored protocol, then override it for
         * RTF_DYNAMIC and RTF_ADDRCONF routes
         */
        rtm->rtm_protocol = rt->rt6i_protocol;
        if (rt->rt6i_flags & RTF_DYNAMIC)
                rtm->rtm_protocol = RTPROT_REDIRECT;
        else if (rt->rt6i_flags & RTF_ADDRCONF) {
                if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
                        rtm->rtm_protocol = RTPROT_RA;
                else
                        rtm->rtm_protocol = RTPROT_KERNEL;
        }

        if (rt->rt6i_flags & RTF_CACHE)
                rtm->rtm_flags |= RTM_F_CLONED;

        /* a caller-supplied address is reported as a full /128 */
        if (dst) {
                if (nla_put_in6_addr(skb, RTA_DST, dst))
                        goto nla_put_failure;
                rtm->rtm_dst_len = 128;
        } else if (rtm->rtm_dst_len)
                if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
                        goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
        if (src) {
                if (nla_put_in6_addr(skb, RTA_SRC, src))
                        goto nla_put_failure;
                rtm->rtm_src_len = 128;
        } else if (rtm->rtm_src_len &&
                   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
                goto nla_put_failure;
#endif
        if (iif) {
#ifdef CONFIG_IPV6_MROUTE
                if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
                        int err = ip6mr_get_route(net, skb, rtm, portid);

                        /* NOTE(review): this returns success without reaching
                         * nlmsg_end() below - confirm what ip6mr_get_route()
                         * leaves in @skb when it returns 0.
                         */
                        if (err == 0)
                                return 0;
                        /* a negative error is reported as -EMSGSIZE; a
                         * positive return falls through to the attributes below
                         */
                        if (err < 0)
                                goto nla_put_failure;
                } else
#endif
                        if (nla_put_u32(skb, RTA_IIF, iif))
                                goto nla_put_failure;
        } else if (dst) {
                struct in6_addr saddr_buf;
                if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
                    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
                        goto nla_put_failure;
        }

        /* NOTE(review): when the branch above already emitted RTA_PREFSRC,
         * this adds a second RTA_PREFSRC attribute to the same message.
         */
        if (rt->rt6i_prefsrc.plen) {
                struct in6_addr saddr_buf;
                saddr_buf = rt->rt6i_prefsrc.addr;
                if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
                        goto nla_put_failure;
        }

        /* work on a copy so rt6i_pmtu can override the MTU slot without
         * modifying the metrics attached to the dst
         */
        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
        if (rt->rt6i_pmtu)
                metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
        if (rtnetlink_put_metrics(skb, metrics) < 0)
                goto nla_put_failure;

        if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
                goto nla_put_failure;

        /* For multipath routes, walk the siblings list and add
         * each as a nexthop within RTA_MULTIPATH.
         */
        if (rt->rt6i_nsiblings) {
                struct rt6_info *sibling, *next_sibling;
                struct nlattr *mp;

                mp = nla_nest_start(skb, RTA_MULTIPATH);
                if (!mp)
                        goto nla_put_failure;

                /* @rt itself is the first next hop, its siblings follow */
                if (rt6_add_nexthop(skb, rt) < 0)
                        goto nla_put_failure;

                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->rt6i_siblings, rt6i_siblings) {
                        if (rt6_add_nexthop(skb, sibling) < 0)
                                goto nla_put_failure;
                }

                nla_nest_end(skb, mp);
        } else {
                if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
                        goto nla_put_failure;
        }

        /* remaining lifetime in jiffies, or 0 for routes without RTF_EXPIRES */
        expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
                goto nla_put_failure;

        if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
                goto nla_put_failure;


        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        /* drop the partially built message from @skb */
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}
3544
3545 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3546 {
3547         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3548         struct net *net = arg->net;
3549
3550         if (rt == net->ipv6.ip6_null_entry)
3551                 return 0;
3552
3553         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3554                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3555
3556                 /* user wants prefix routes only */
3557                 if (rtm->rtm_flags & RTM_F_PREFIX &&
3558                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3559                         /* success since this is not a prefix route */
3560                         return 1;
3561                 }
3562         }
3563
3564         return rt6_fill_node(net,
3565                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3566                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3567                      NLM_F_MULTI);
3568 }
3569
3570 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3571                               struct netlink_ext_ack *extack)
3572 {
3573         struct net *net = sock_net(in_skb->sk);
3574         struct nlattr *tb[RTA_MAX+1];
3575         struct rt6_info *rt;
3576         struct sk_buff *skb;
3577         struct rtmsg *rtm;
3578         struct flowi6 fl6;
3579         int err, iif = 0, oif = 0;
3580
3581         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3582                           extack);
3583         if (err < 0)
3584                 goto errout;
3585
3586         err = -EINVAL;
3587         memset(&fl6, 0, sizeof(fl6));
3588         rtm = nlmsg_data(nlh);
3589         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3590
3591         if (tb[RTA_SRC]) {
3592                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3593                         goto errout;
3594
3595                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3596         }
3597
3598         if (tb[RTA_DST]) {
3599                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3600                         goto errout;
3601
3602                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3603         }
3604
3605         if (tb[RTA_IIF])
3606                 iif = nla_get_u32(tb[RTA_IIF]);
3607
3608         if (tb[RTA_OIF])
3609                 oif = nla_get_u32(tb[RTA_OIF]);
3610
3611         if (tb[RTA_MARK])
3612                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3613
3614         if (tb[RTA_UID])
3615                 fl6.flowi6_uid = make_kuid(current_user_ns(),
3616                                            nla_get_u32(tb[RTA_UID]));
3617         else
3618                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
3619
3620         if (iif) {
3621                 struct net_device *dev;
3622                 int flags = 0;
3623
3624                 dev = __dev_get_by_index(net, iif);
3625                 if (!dev) {
3626                         err = -ENODEV;
3627                         goto errout;
3628                 }
3629
3630                 fl6.flowi6_iif = iif;
3631
3632                 if (!ipv6_addr_any(&fl6.saddr))
3633                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3634
3635                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3636                                                                flags);
3637         } else {
3638                 fl6.flowi6_oif = oif;
3639
3640                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3641         }
3642
3643         if (rt == net->ipv6.ip6_null_entry) {
3644                 err = rt->dst.error;
3645                 ip6_rt_put(rt);
3646                 goto errout;
3647         }
3648
3649         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3650         if (!skb) {
3651                 ip6_rt_put(rt);
3652                 err = -ENOBUFS;
3653                 goto errout;
3654         }
3655
3656         skb_dst_set(skb, &rt->dst);
3657
3658         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3659                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3660                             nlh->nlmsg_seq, 0);
3661         if (err < 0) {
3662                 kfree_skb(skb);
3663                 goto errout;
3664         }
3665
3666         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3667 errout:
3668         return err;
3669 }
3670
3671 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3672                      unsigned int nlm_flags)
3673 {
3674         struct sk_buff *skb;
3675         struct net *net = info->nl_net;
3676         u32 seq;
3677         int err;
3678
3679         err = -ENOBUFS;
3680         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3681
3682         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3683         if (!skb)
3684                 goto errout;
3685
3686         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3687                                 event, info->portid, seq, nlm_flags);
3688         if (err < 0) {
3689                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3690                 WARN_ON(err == -EMSGSIZE);
3691                 kfree_skb(skb);
3692                 goto errout;
3693         }
3694         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3695                     info->nlh, gfp_any());
3696         return;
3697 errout:
3698         if (err < 0)
3699                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3700 }
3701
3702 static int ip6_route_dev_notify(struct notifier_block *this,
3703                                 unsigned long event, void *ptr)
3704 {
3705         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3706         struct net *net = dev_net(dev);
3707
3708         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3709                 net->ipv6.ip6_null_entry->dst.dev = dev;
3710                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3711 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3712                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3713                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3714                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3715                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3716 #endif
3717         }
3718
3719         return NOTIFY_OK;
3720 }
3721
3722 /*
3723  *      /proc
3724  */
3725
3726 #ifdef CONFIG_PROC_FS
3727
/* File operations for the "ipv6_route" proc entry registered in
 * ip6_route_net_init_late().
 */
static const struct file_operations ipv6_route_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = ipv6_route_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_net,
};
3735
3736 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3737 {
3738         struct net *net = (struct net *)seq->private;
3739         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3740                    net->ipv6.rt6_stats->fib_nodes,
3741                    net->ipv6.rt6_stats->fib_route_nodes,
3742                    net->ipv6.rt6_stats->fib_rt_alloc,
3743                    net->ipv6.rt6_stats->fib_rt_entries,
3744                    net->ipv6.rt6_stats->fib_rt_cache,
3745                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3746                    net->ipv6.rt6_stats->fib_discarded_routes);
3747
3748         return 0;
3749 }
3750
/* open() for "rt6_stats": single-record seq_file bound to the opener's
 * namespace, rendered by rt6_stats_seq_show().
 */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
        return single_open_net(inode, file, rt6_stats_seq_show);
}
3755
/* File operations for the "rt6_stats" proc entry registered in
 * ip6_route_net_init_late().
 */
static const struct file_operations rt6_stats_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt6_stats_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release_net,
};
3763 #endif  /* CONFIG_PROC_FS */
3764
3765 #ifdef CONFIG_SYSCTL
3766
3767 static
3768 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3769                               void __user *buffer, size_t *lenp, loff_t *ppos)
3770 {
3771         struct net *net;
3772         int delay;
3773         if (!write)
3774                 return -EINVAL;
3775
3776         net = (struct net *)ctl->extra1;
3777         delay = net->ipv6.sysctl.flush_delay;
3778         proc_dointvec(ctl, write, buffer, lenp, ppos);
3779         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3780         return 0;
3781 }
3782
/* Template for the per-namespace IPv6 route sysctl table.
 *
 * ipv6_route_sysctl_init() duplicates this array and rewrites the .data
 * pointer of entries 0..9 by position, so the order of the entries here
 * must stay in sync with the indices used there.
 */
struct ctl_table ipv6_route_table_template[] = {
        {
                /* [0] write-only trigger, see ipv6_sysctl_rtcache_flush() */
                .procname       =       "flush",
                .data           =       &init_net.ipv6.sysctl.flush_delay,
                .maxlen         =       sizeof(int),
                .mode           =       0200,
                .proc_handler   =       ipv6_sysctl_rtcache_flush
        },
        {
                /* [1] */
                .procname       =       "gc_thresh",
                .data           =       &ip6_dst_ops_template.gc_thresh,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* [2] */
                .procname       =       "max_size",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* [3] same variable as [9], accessed through the jiffies handler */
                .procname       =       "gc_min_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                /* [4] */
                .procname       =       "gc_timeout",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                /* [5] */
                .procname       =       "gc_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                /* [6] */
                .procname       =       "gc_elasticity",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* [7] */
                .procname       =       "mtu_expires",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                /* [8] */
                .procname       =       "min_adv_mss",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* [9] same variable as [3], accessed through the ms_jiffies handler */
                .procname       =       "gc_min_interval_ms",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_ms_jiffies,
        },
        { }
};
3856
3857 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3858 {
3859         struct ctl_table *table;
3860
3861         table = kmemdup(ipv6_route_table_template,
3862                         sizeof(ipv6_route_table_template),
3863                         GFP_KERNEL);
3864
3865         if (table) {
3866                 table[0].data = &net->ipv6.sysctl.flush_delay;
3867                 table[0].extra1 = net;
3868                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3869                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3870                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3871                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3872                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3873                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3874                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3875                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3876                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3877
3878                 /* Don't export sysctls to unprivileged users */
3879                 if (net->user_ns != &init_user_ns)
3880                         table[0].procname = NULL;
3881         }
3882
3883         return table;
3884 }
3885 #endif
3886
3887 static int __net_init ip6_route_net_init(struct net *net)
3888 {
3889         int ret = -ENOMEM;
3890
3891         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3892                sizeof(net->ipv6.ip6_dst_ops));
3893
3894         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3895                 goto out_ip6_dst_ops;
3896
3897         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3898                                            sizeof(*net->ipv6.ip6_null_entry),
3899                                            GFP_KERNEL);
3900         if (!net->ipv6.ip6_null_entry)
3901                 goto out_ip6_dst_entries;
3902         net->ipv6.ip6_null_entry->dst.path =
3903                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3904         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3905         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3906                          ip6_template_metrics, true);
3907
3908 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3909         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3910                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3911                                                GFP_KERNEL);
3912         if (!net->ipv6.ip6_prohibit_entry)
3913                 goto out_ip6_null_entry;
3914         net->ipv6.ip6_prohibit_entry->dst.path =
3915                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3916         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3917         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3918                          ip6_template_metrics, true);
3919
3920         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3921                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3922                                                GFP_KERNEL);
3923         if (!net->ipv6.ip6_blk_hole_entry)
3924                 goto out_ip6_prohibit_entry;
3925         net->ipv6.ip6_blk_hole_entry->dst.path =
3926                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3927         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3928         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3929                          ip6_template_metrics, true);
3930 #endif
3931
3932         net->ipv6.sysctl.flush_delay = 0;
3933         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3934         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3935         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3936         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3937         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3938         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3939         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3940
3941         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3942
3943         ret = 0;
3944 out:
3945         return ret;
3946
3947 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3948 out_ip6_prohibit_entry:
3949         kfree(net->ipv6.ip6_prohibit_entry);
3950 out_ip6_null_entry:
3951         kfree(net->ipv6.ip6_null_entry);
3952 #endif
3953 out_ip6_dst_entries:
3954         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3955 out_ip6_dst_ops:
3956         goto out;
3957 }
3958
/* Per-namespace teardown: free the special route entries allocated by
 * ip6_route_net_init() and destroy the namespace's dst entry counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
        kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        kfree(net->ipv6.ip6_prohibit_entry);
        kfree(net->ipv6.ip6_blk_hole_entry);
#endif
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
3968
/* Late per-namespace init: create the "ipv6_route" and "rt6_stats" proc
 * entries under the namespace's proc_net directory.
 *
 * Always returns 0; the results of proc_create() are not checked.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
        proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
        proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
#endif
        return 0;
}
3977
/* Late per-namespace exit: remove the proc entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
        remove_proc_entry("ipv6_route", net->proc_net);
        remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
3985
/* Per-namespace hooks for the core routing state (special route entries,
 * dst_ops and sysctl defaults).
 */
static struct pernet_operations ip6_route_net_ops = {
        .init = ip6_route_net_init,
        .exit = ip6_route_net_exit,
};
3990
3991 static int __net_init ipv6_inetpeer_init(struct net *net)
3992 {
3993         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3994
3995         if (!bp)
3996                 return -ENOMEM;
3997         inet_peer_base_init(bp);
3998         net->ipv6.peers = bp;
3999         return 0;
4000 }
4001
4002 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4003 {
4004         struct inet_peer_base *bp = net->ipv6.peers;
4005
4006         net->ipv6.peers = NULL;
4007         inetpeer_invalidate_tree(bp);
4008         kfree(bp);
4009 }
4010
/* Per-namespace hooks managing net->ipv6.peers. */
static struct pernet_operations ipv6_inetpeer_ops = {
        .init   =       ipv6_inetpeer_init,
        .exit   =       ipv6_inetpeer_exit,
};
4015
/* Per-namespace hooks for the proc entries; registered by ip6_route_init()
 * after the fib6, xfrm6 and fib6 rules initialisation.
 */
static struct pernet_operations ip6_route_net_late_ops = {
        .init = ip6_route_net_init_late,
        .exit = ip6_route_net_exit_late,
};
4020
/* Netdevice notifier block that dispatches to ip6_route_dev_notify(). */
static struct notifier_block ip6_route_dev_notifier = {
        .notifier_call = ip6_route_dev_notify,
        .priority = 0,
};
4025
4026 int __init ip6_route_init(void)
4027 {
4028         int ret;
4029         int cpu;
4030
4031         ret = -ENOMEM;
4032         ip6_dst_ops_template.kmem_cachep =
4033                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4034                                   SLAB_HWCACHE_ALIGN, NULL);
4035         if (!ip6_dst_ops_template.kmem_cachep)
4036                 goto out;
4037
4038         ret = dst_entries_init(&ip6_dst_blackhole_ops);
4039         if (ret)
4040                 goto out_kmem_cache;
4041
4042         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4043         if (ret)
4044                 goto out_dst_entries;
4045
4046         ret = register_pernet_subsys(&ip6_route_net_ops);
4047         if (ret)
4048                 goto out_register_inetpeer;
4049
4050         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4051
4052         /* Registering of the loopback is done before this portion of code,
4053          * the loopback reference in rt6_info will not be taken, do it
4054          * manually for init_net */
4055         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4056         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4057   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4058         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4059         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4060         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4061         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4062   #endif
4063         ret = fib6_init();
4064         if (ret)
4065                 goto out_register_subsys;
4066
4067         ret = xfrm6_init();
4068         if (ret)
4069                 goto out_fib6_init;
4070
4071         ret = fib6_rules_init();
4072         if (ret)
4073                 goto xfrm6_init;
4074
4075         ret = register_pernet_subsys(&ip6_route_net_late_ops);
4076         if (ret)
4077                 goto fib6_rules_init;
4078
4079         ret = -ENOBUFS;
4080         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
4081             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
4082             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
4083                 goto out_register_late_subsys;
4084
4085         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4086         if (ret)
4087                 goto out_register_late_subsys;
4088
4089         for_each_possible_cpu(cpu) {
4090                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4091
4092                 INIT_LIST_HEAD(&ul->head);
4093                 spin_lock_init(&ul->lock);
4094         }
4095
4096 out:
4097         return ret;
4098
4099 out_register_late_subsys:
4100         unregister_pernet_subsys(&ip6_route_net_late_ops);
4101 fib6_rules_init:
4102         fib6_rules_cleanup();
4103 xfrm6_init:
4104         xfrm6_fini();
4105 out_fib6_init:
4106         fib6_gc_cleanup();
4107 out_register_subsys:
4108         unregister_pernet_subsys(&ip6_route_net_ops);
4109 out_register_inetpeer:
4110         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4111 out_dst_entries:
4112         dst_entries_destroy(&ip6_dst_blackhole_ops);
4113 out_kmem_cache:
4114         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4115         goto out;
4116 }
4117
/* Tear down what ip6_route_init() set up: the netdevice notifier, the late
 * per-namespace ops, fib6 rules, xfrm6, fib6, both per-namespace
 * subsystems, the blackhole dst counter and finally the dst slab cache.
 *
 * NOTE(review): ipv6_inetpeer_ops is unregistered before ip6_route_net_ops
 * here, whereas the error path of ip6_route_init() unregisters them in the
 * opposite order - confirm the ordering is intentional.
 */
void ip6_route_cleanup(void)
{
        unregister_netdevice_notifier(&ip6_route_dev_notifier);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
        fib6_rules_cleanup();
        xfrm6_fini();
        fib6_gc_cleanup();
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
        unregister_pernet_subsys(&ip6_route_net_ops);
        dst_entries_destroy(&ip6_dst_blackhole_ops);
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}