net: Pass neighbours and dest address into NETEVENT_REDIRECT events.
[profile/ivi/kernel-adaptation-intel-automotive.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60
61 #include <asm/uaccess.h>
62
63 #ifdef CONFIG_SYSCTL
64 #include <linux/sysctl.h>
65 #endif
66
67 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
68                                     const struct in6_addr *dest);
69 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
70 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
71 static unsigned int      ip6_mtu(const struct dst_entry *dst);
72 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
73 static void             ip6_dst_destroy(struct dst_entry *);
74 static void             ip6_dst_ifdown(struct dst_entry *,
75                                        struct net_device *dev, int how);
76 static int               ip6_dst_gc(struct dst_ops *ops);
77
78 static int              ip6_pkt_discard(struct sk_buff *skb);
79 static int              ip6_pkt_discard_out(struct sk_buff *skb);
80 static void             ip6_link_failure(struct sk_buff *skb);
81 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
82
83 #ifdef CONFIG_IPV6_ROUTE_INFO
84 static struct rt6_info *rt6_add_route_info(struct net *net,
85                                            const struct in6_addr *prefix, int prefixlen,
86                                            const struct in6_addr *gwaddr, int ifindex,
87                                            unsigned int pref);
88 static struct rt6_info *rt6_get_route_info(struct net *net,
89                                            const struct in6_addr *prefix, int prefixlen,
90                                            const struct in6_addr *gwaddr, int ifindex);
91 #endif
92
93 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
94 {
95         struct rt6_info *rt = (struct rt6_info *) dst;
96         struct inet_peer *peer;
97         u32 *p = NULL;
98
99         if (!(rt->dst.flags & DST_HOST))
100                 return NULL;
101
102         peer = rt6_get_peer_create(rt);
103         if (peer) {
104                 u32 *old_p = __DST_METRICS_PTR(old);
105                 unsigned long prev, new;
106
107                 p = peer->metrics;
108                 if (inet_metrics_new(peer))
109                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
110
111                 new = (unsigned long) p;
112                 prev = cmpxchg(&dst->_metrics, old, new);
113
114                 if (prev != old) {
115                         p = __DST_METRICS_PTR(prev);
116                         if (prev & DST_METRICS_READ_ONLY)
117                                 p = NULL;
118                 }
119         }
120         return p;
121 }
122
123 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
124                                              struct sk_buff *skb,
125                                              const void *daddr)
126 {
127         struct in6_addr *p = &rt->rt6i_gateway;
128
129         if (!ipv6_addr_any(p))
130                 return (const void *) p;
131         else if (skb)
132                 return &ipv6_hdr(skb)->daddr;
133         return daddr;
134 }
135
136 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
137                                           struct sk_buff *skb,
138                                           const void *daddr)
139 {
140         struct rt6_info *rt = (struct rt6_info *) dst;
141         struct neighbour *n;
142
143         daddr = choose_neigh_daddr(rt, skb, daddr);
144         n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
145         if (n)
146                 return n;
147         return neigh_create(&nd_tbl, daddr, dst->dev);
148 }
149
150 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
151 {
152         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
153         if (!n) {
154                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
155                 if (IS_ERR(n))
156                         return PTR_ERR(n);
157         }
158         dst_set_neighbour(&rt->dst, n);
159
160         return 0;
161 }
162
163 static struct dst_ops ip6_dst_ops_template = {
164         .family                 =       AF_INET6,
165         .protocol               =       cpu_to_be16(ETH_P_IPV6),
166         .gc                     =       ip6_dst_gc,
167         .gc_thresh              =       1024,
168         .check                  =       ip6_dst_check,
169         .default_advmss         =       ip6_default_advmss,
170         .mtu                    =       ip6_mtu,
171         .cow_metrics            =       ipv6_cow_metrics,
172         .destroy                =       ip6_dst_destroy,
173         .ifdown                 =       ip6_dst_ifdown,
174         .negative_advice        =       ip6_negative_advice,
175         .link_failure           =       ip6_link_failure,
176         .update_pmtu            =       ip6_rt_update_pmtu,
177         .local_out              =       __ip6_local_out,
178         .neigh_lookup           =       ip6_neigh_lookup,
179 };
180
181 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
182 {
183         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
184
185         return mtu ? : dst->dev->mtu;
186 }
187
188 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
189 {
190 }
191
192 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
193                                          unsigned long old)
194 {
195         return NULL;
196 }
197
198 static struct dst_ops ip6_dst_blackhole_ops = {
199         .family                 =       AF_INET6,
200         .protocol               =       cpu_to_be16(ETH_P_IPV6),
201         .destroy                =       ip6_dst_destroy,
202         .check                  =       ip6_dst_check,
203         .mtu                    =       ip6_blackhole_mtu,
204         .default_advmss         =       ip6_default_advmss,
205         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
206         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
207         .neigh_lookup           =       ip6_neigh_lookup,
208 };
209
210 static const u32 ip6_template_metrics[RTAX_MAX] = {
211         [RTAX_HOPLIMIT - 1] = 255,
212 };
213
214 static struct rt6_info ip6_null_entry_template = {
215         .dst = {
216                 .__refcnt       = ATOMIC_INIT(1),
217                 .__use          = 1,
218                 .obsolete       = -1,
219                 .error          = -ENETUNREACH,
220                 .input          = ip6_pkt_discard,
221                 .output         = ip6_pkt_discard_out,
222         },
223         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
224         .rt6i_protocol  = RTPROT_KERNEL,
225         .rt6i_metric    = ~(u32) 0,
226         .rt6i_ref       = ATOMIC_INIT(1),
227 };
228
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route entry: a reject route whose dst reports
 * -EACCES and hands packets to the ip6_pkt_prohibit*() handlers.
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/*
 * Template for the "blackhole" route entry: a reject route whose dst reports
 * -EINVAL and passes packets to dst_discard() in both directions.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
265
266 /* allocate dst with ip6_dst_ops */
267 static inline struct rt6_info *ip6_dst_alloc(struct net *net,
268                                              struct net_device *dev,
269                                              int flags,
270                                              struct fib6_table *table)
271 {
272         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
273                                         0, 0, flags);
274
275         if (rt) {
276                 memset(&rt->rt6i_table, 0,
277                        sizeof(*rt) - sizeof(struct dst_entry));
278                 rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
279         }
280         return rt;
281 }
282
283 static void ip6_dst_destroy(struct dst_entry *dst)
284 {
285         struct rt6_info *rt = (struct rt6_info *)dst;
286         struct inet6_dev *idev = rt->rt6i_idev;
287
288         if (!(rt->dst.flags & DST_HOST))
289                 dst_destroy_metrics_generic(dst);
290
291         if (idev) {
292                 rt->rt6i_idev = NULL;
293                 in6_dev_put(idev);
294         }
295
296         if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
297                 dst_release(dst->from);
298
299         if (rt6_has_peer(rt)) {
300                 struct inet_peer *peer = rt6_peer_ptr(rt);
301                 inet_putpeer(peer);
302         }
303 }
304
305 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
306
307 static u32 rt6_peer_genid(void)
308 {
309         return atomic_read(&__rt6_peer_genid);
310 }
311
312 void rt6_bind_peer(struct rt6_info *rt, int create)
313 {
314         struct inet_peer_base *base;
315         struct inet_peer *peer;
316
317         base = inetpeer_base_ptr(rt->_rt6i_peer);
318         if (!base)
319                 return;
320
321         peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
322         if (peer) {
323                 if (!rt6_set_peer(rt, peer))
324                         inet_putpeer(peer);
325                 else
326                         rt->rt6i_peer_genid = rt6_peer_genid();
327         }
328 }
329
330 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
331                            int how)
332 {
333         struct rt6_info *rt = (struct rt6_info *)dst;
334         struct inet6_dev *idev = rt->rt6i_idev;
335         struct net_device *loopback_dev =
336                 dev_net(dev)->loopback_dev;
337
338         if (dev != loopback_dev && idev && idev->dev == dev) {
339                 struct inet6_dev *loopback_idev =
340                         in6_dev_get(loopback_dev);
341                 if (loopback_idev) {
342                         rt->rt6i_idev = loopback_idev;
343                         in6_dev_put(idev);
344                 }
345         }
346 }
347
348 static bool rt6_check_expired(const struct rt6_info *rt)
349 {
350         struct rt6_info *ort = NULL;
351
352         if (rt->rt6i_flags & RTF_EXPIRES) {
353                 if (time_after(jiffies, rt->dst.expires))
354                         return true;
355         } else if (rt->dst.from) {
356                 ort = (struct rt6_info *) rt->dst.from;
357                 return (ort->rt6i_flags & RTF_EXPIRES) &&
358                         time_after(jiffies, ort->dst.expires);
359         }
360         return false;
361 }
362
363 static bool rt6_need_strict(const struct in6_addr *daddr)
364 {
365         return ipv6_addr_type(daddr) &
366                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
367 }
368
369 /*
370  *      Route lookup. Any table->tb6_lock is implied.
371  */
372
373 static inline struct rt6_info *rt6_device_match(struct net *net,
374                                                     struct rt6_info *rt,
375                                                     const struct in6_addr *saddr,
376                                                     int oif,
377                                                     int flags)
378 {
379         struct rt6_info *local = NULL;
380         struct rt6_info *sprt;
381
382         if (!oif && ipv6_addr_any(saddr))
383                 goto out;
384
385         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
386                 struct net_device *dev = sprt->dst.dev;
387
388                 if (oif) {
389                         if (dev->ifindex == oif)
390                                 return sprt;
391                         if (dev->flags & IFF_LOOPBACK) {
392                                 if (!sprt->rt6i_idev ||
393                                     sprt->rt6i_idev->dev->ifindex != oif) {
394                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
395                                                 continue;
396                                         if (local && (!oif ||
397                                                       local->rt6i_idev->dev->ifindex == oif))
398                                                 continue;
399                                 }
400                                 local = sprt;
401                         }
402                 } else {
403                         if (ipv6_chk_addr(net, saddr, dev,
404                                           flags & RT6_LOOKUP_F_IFACE))
405                                 return sprt;
406                 }
407         }
408
409         if (oif) {
410                 if (local)
411                         return local;
412
413                 if (flags & RT6_LOOKUP_F_IFACE)
414                         return net->ipv6.ip6_null_entry;
415         }
416 out:
417         return rt;
418 }
419
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * Okay, this does not seem to be appropriate for now, however, we need to
 * check if it is really so.  Router Reachability Probe MUST be rate-limited
 * to no more than one per minute.
 *
 * If @rt has a neighbour that is not in a NUD_VALID state and the last
 * update is older than the interface's rtr_probe_interval, stamp the
 * neighbour and send a neighbour solicitation to the solicited-node
 * multicast address of the neighbour's key.  A NULL @rt is tolerated.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;

	rcu_read_lock();
	neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
	if (neigh && !(neigh->nud_state & NUD_VALID)) {
		bool due;

		/* Re-check the state and the rate limit under the lock. */
		read_lock_bh(&neigh->lock);
		due = !(neigh->nud_state & NUD_VALID) &&
		      time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
		if (due)
			neigh->updated = jiffies;
		read_unlock_bh(&neigh->lock);

		if (due) {
			struct in6_addr mcaddr;
			struct in6_addr *target;

			target = (struct in6_addr *)&neigh->primary_key;
			addrconf_addr_solict_mult(target, &mcaddr);
			ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
		}
	}
	rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
459
460 /*
461  * Default Router Selection (RFC 2461 6.3.6)
462  */
463 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
464 {
465         struct net_device *dev = rt->dst.dev;
466         if (!oif || dev->ifindex == oif)
467                 return 2;
468         if ((dev->flags & IFF_LOOPBACK) &&
469             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
470                 return 1;
471         return 0;
472 }
473
474 static inline int rt6_check_neigh(struct rt6_info *rt)
475 {
476         struct neighbour *neigh;
477         int m;
478
479         rcu_read_lock();
480         neigh = dst_get_neighbour_noref(&rt->dst);
481         if (rt->rt6i_flags & RTF_NONEXTHOP ||
482             !(rt->rt6i_flags & RTF_GATEWAY))
483                 m = 1;
484         else if (neigh) {
485                 read_lock_bh(&neigh->lock);
486                 if (neigh->nud_state & NUD_VALID)
487                         m = 2;
488 #ifdef CONFIG_IPV6_ROUTER_PREF
489                 else if (neigh->nud_state & NUD_FAILED)
490                         m = 0;
491 #endif
492                 else
493                         m = 1;
494                 read_unlock_bh(&neigh->lock);
495         } else
496                 m = 0;
497         rcu_read_unlock();
498         return m;
499 }
500
501 static int rt6_score_route(struct rt6_info *rt, int oif,
502                            int strict)
503 {
504         int m, n;
505
506         m = rt6_check_dev(rt, oif);
507         if (!m && (strict & RT6_LOOKUP_F_IFACE))
508                 return -1;
509 #ifdef CONFIG_IPV6_ROUTER_PREF
510         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
511 #endif
512         n = rt6_check_neigh(rt);
513         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
514                 return -1;
515         return m;
516 }
517
518 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
519                                    int *mpri, struct rt6_info *match)
520 {
521         int m;
522
523         if (rt6_check_expired(rt))
524                 goto out;
525
526         m = rt6_score_route(rt, oif, strict);
527         if (m < 0)
528                 goto out;
529
530         if (m > *mpri) {
531                 if (strict & RT6_LOOKUP_F_REACHABLE)
532                         rt6_probe(match);
533                 *mpri = m;
534                 match = rt;
535         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
536                 rt6_probe(rt);
537         }
538
539 out:
540         return match;
541 }
542
543 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
544                                      struct rt6_info *rr_head,
545                                      u32 metric, int oif, int strict)
546 {
547         struct rt6_info *rt, *match;
548         int mpri = -1;
549
550         match = NULL;
551         for (rt = rr_head; rt && rt->rt6i_metric == metric;
552              rt = rt->dst.rt6_next)
553                 match = find_match(rt, oif, strict, &mpri, match);
554         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
555              rt = rt->dst.rt6_next)
556                 match = find_match(rt, oif, strict, &mpri, match);
557
558         return match;
559 }
560
561 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
562 {
563         struct rt6_info *match, *rt0;
564         struct net *net;
565
566         rt0 = fn->rr_ptr;
567         if (!rt0)
568                 fn->rr_ptr = rt0 = fn->leaf;
569
570         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
571
572         if (!match &&
573             (strict & RT6_LOOKUP_F_REACHABLE)) {
574                 struct rt6_info *next = rt0->dst.rt6_next;
575
576                 /* no entries matched; do round-robin */
577                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
578                         next = fn->leaf;
579
580                 if (next != rt0)
581                         fn->rr_ptr = next;
582         }
583
584         net = dev_net(rt0->dst.dev);
585         return match ? match : net->ipv6.ip6_null_entry;
586 }
587
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a received Route Information option.
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option (interpreted as struct route_info)
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router, used as the route's gateway
 *
 * Adds, refreshes or (for a zero lifetime) deletes the route described by
 * the option.  Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	/* Guard against a negative length before the unsigned comparison. */
	if (len < 0 || (size_t) len < sizeof(struct route_info))
		return -EINVAL;

	/*
	 * Sanity check for prefix_len and length (RFC 4191, section 2.3).
	 * Length is in units of 8 octets and includes the 8-octet header, so
	 * the option carries 0, 8 or 16 prefix octets for Length 1, 2 or 3:
	 *   Prefix Length  > 64  requires Length == 3
	 *   Prefix Length  > 0   requires Length >= 2
	 * Anything shorter would make the prefix copy below read beyond the
	 * option.
	 */
	if (rinfo->length < 1 || rinfo->length > 3)
		return -EINVAL;
	if (rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 3)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 2)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		/* Full 16-octet prefix present: use it in place. */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/*
		 * Shorter option: build the prefix in a local buffer.  The
		 * checks above guarantee prefix_len bits are present.
		 */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		dst_release(&rt->dst);
	}
	return 0;
}
#endif
660
/*
 * BACKTRACK - climb the fib6 tree after a lookup produced the null entry.
 *
 * Not self-contained: the expansion uses the caller's local variables
 * 'rt' (struct rt6_info *) and 'fn' (struct fib6_node *), and jumps to the
 * caller's labels 'out' (top-level root reached, give up) and 'restart'
 * (a node carrying route info was found, retry the selection there).
 *
 * When the parent has a subtree that is not the current node, the search
 * continues in that subtree using @saddr; otherwise it moves to the parent.
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == (__net)->ipv6.ip6_null_entry) { \
		struct fib6_node *pn; \
		for (;;) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
678
679 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
680                                              struct fib6_table *table,
681                                              struct flowi6 *fl6, int flags)
682 {
683         struct fib6_node *fn;
684         struct rt6_info *rt;
685
686         read_lock_bh(&table->tb6_lock);
687         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
688 restart:
689         rt = fn->leaf;
690         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
691         BACKTRACK(net, &fl6->saddr);
692 out:
693         dst_use(&rt->dst, jiffies);
694         read_unlock_bh(&table->tb6_lock);
695         return rt;
696
697 }
698
699 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
700                                     int flags)
701 {
702         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
703 }
704 EXPORT_SYMBOL_GPL(ip6_route_lookup);
705
706 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
707                             const struct in6_addr *saddr, int oif, int strict)
708 {
709         struct flowi6 fl6 = {
710                 .flowi6_oif = oif,
711                 .daddr = *daddr,
712         };
713         struct dst_entry *dst;
714         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
715
716         if (saddr) {
717                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
718                 flags |= RT6_LOOKUP_F_HAS_SADDR;
719         }
720
721         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
722         if (dst->error == 0)
723                 return (struct rt6_info *) dst;
724
725         dst_release(dst);
726
727         return NULL;
728 }
729
730 EXPORT_SYMBOL(rt6_lookup);
731
732 /* ip6_ins_rt is called with FREE table->tb6_lock.
733    It takes new route entry, the addition fails by any reason the
734    route is freed. In any case, if caller does not hold it, it may
735    be destroyed.
736  */
737
738 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
739 {
740         int err;
741         struct fib6_table *table;
742
743         table = rt->rt6i_table;
744         write_lock_bh(&table->tb6_lock);
745         err = fib6_add(&table->tb6_root, rt, info);
746         write_unlock_bh(&table->tb6_lock);
747
748         return err;
749 }
750
751 int ip6_ins_rt(struct rt6_info *rt)
752 {
753         struct nl_info info = {
754                 .nl_net = dev_net(rt->dst.dev),
755         };
756         return __ip6_ins_rt(rt, &info);
757 }
758
759 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
760                                       const struct in6_addr *daddr,
761                                       const struct in6_addr *saddr)
762 {
763         struct rt6_info *rt;
764
765         /*
766          *      Clone the route.
767          */
768
769         rt = ip6_rt_copy(ort, daddr);
770
771         if (rt) {
772                 int attempts = !in_softirq();
773
774                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
775                         if (ort->rt6i_dst.plen != 128 &&
776                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
777                                 rt->rt6i_flags |= RTF_ANYCAST;
778                         rt->rt6i_gateway = *daddr;
779                 }
780
781                 rt->rt6i_flags |= RTF_CACHE;
782
783 #ifdef CONFIG_IPV6_SUBTREES
784                 if (rt->rt6i_src.plen && saddr) {
785                         rt->rt6i_src.addr = *saddr;
786                         rt->rt6i_src.plen = 128;
787                 }
788 #endif
789
790         retry:
791                 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
792                         struct net *net = dev_net(rt->dst.dev);
793                         int saved_rt_min_interval =
794                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
795                         int saved_rt_elasticity =
796                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
797
798                         if (attempts-- > 0) {
799                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
800                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
801
802                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
803
804                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
805                                         saved_rt_elasticity;
806                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
807                                         saved_rt_min_interval;
808                                 goto retry;
809                         }
810
811                         net_warn_ratelimited("Neighbour table overflow\n");
812                         dst_free(&rt->dst);
813                         return NULL;
814                 }
815         }
816
817         return rt;
818 }
819
820 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
821                                         const struct in6_addr *daddr)
822 {
823         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
824
825         if (rt) {
826                 rt->rt6i_flags |= RTF_CACHE;
827                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
828         }
829         return rt;
830 }
831
/*
 * Core policy-routing lookup shared by the input and output paths.
 *
 * Looks up @fl6 in @table (restricted to interface @oif according to
 * @flags) and returns a route with a reference held: every exit path
 * performs dst_hold() on the entry that is returned.
 *
 * If the selected route is the namespace's ip6_null_entry or already has
 * RTF_CACHE set it is returned as-is.  Otherwise a per-destination copy is
 * created -- rt6_alloc_cow() when no neighbour is bound and the route is
 * not RTF_NONEXTHOP, rt6_alloc_clone() when the route is not DST_HOST --
 * and inserted with ip6_ins_rt().  The table lock is dropped while the
 * copy is built, so a failed insertion triggers a fresh lookup; at most
 * three attempts are made.
 *
 * When forwarding is disabled the first pass adds RT6_LOOKUP_F_REACHABLE
 * to the selection flags; if that pass ends at the "out" label the lookup
 * is repeated once without it.
 *
 * Before returning, dst.lastuse and dst.__use of the result are updated.
 */
832 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
833                                       struct flowi6 *fl6, int flags)
834 {
835         struct fib6_node *fn;
836         struct rt6_info *rt, *nrt;
837         int strict = 0;
838         int attempts = 3;
839         int err;
840         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
841
842         strict |= flags & RT6_LOOKUP_F_IFACE;
843
844 relookup:
845         read_lock_bh(&table->tb6_lock);
846
847 restart_2:
848         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
849
850 restart:
851         rt = rt6_select(fn, oif, strict | reachable);
852
            /* NOTE(review): BACKTRACK is defined outside this view; it
             * presumably walks up the tree and jumps to "restart"/"out" --
             * confirm against its definition.
             */
853         BACKTRACK(net, &fl6->saddr);
854         if (rt == net->ipv6.ip6_null_entry ||
855             rt->rt6i_flags & RTF_CACHE)
856                 goto out;
857
            /* Pin the route before dropping the table lock. */
858         dst_hold(&rt->dst);
859         read_unlock_bh(&table->tb6_lock);
860
861         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
862                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
863         else if (!(rt->dst.flags & DST_HOST))
864                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
865         else
866                 goto out2;
867
            /* Swap our reference from the original route to the copy, or to
             * the null entry if the allocation failed.
             */
868         dst_release(&rt->dst);
869         rt = nrt ? : net->ipv6.ip6_null_entry;
870
871         dst_hold(&rt->dst);
872         if (nrt) {
873                 err = ip6_ins_rt(nrt);
874                 if (!err)
875                         goto out2;
876         }
877
878         if (--attempts <= 0)
879                 goto out2;
880
881         /*
882          * Race condition! In the gap, when table->tb6_lock was
883          * released someone could insert this route.  Relookup.
884          */
885         dst_release(&rt->dst);
886         goto relookup;
887
888 out:
            /* Still under tb6_lock here: retry once without the
             * reachability requirement before accepting the result.
             */
889         if (reachable) {
890                 reachable = 0;
891                 goto restart_2;
892         }
893         dst_hold(&rt->dst);
894         read_unlock_bh(&table->tb6_lock);
895 out2:
896         rt->dst.lastuse = jiffies;
897         rt->dst.__use++;
898
899         return rt;
900 }
901
/*
 * Lookup callback for the input path: runs ip6_pol_route() using the
 * flow's incoming interface index (flowi6_iif) as the interface argument.
 */
902 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
903                                             struct flowi6 *fl6, int flags)
904 {
905         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
906 }
907
/*
 * Resolve the route for a received flow via the policy rules.
 *
 * RT6_LOOKUP_F_IFACE is added to @flags when rt6_need_strict() reports
 * that the destination address requires it, unless @dev is of type
 * ARPHRD_PIMREG.  The per-table lookup is done by ip6_pol_route_input().
 */
908 static struct dst_entry *ip6_route_input_lookup(struct net *net,
909                                                 struct net_device *dev,
910                                                 struct flowi6 *fl6, int flags)
911 {
912         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
913                 flags |= RT6_LOOKUP_F_IFACE;
914
915         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
916 }
917
/*
 * Route a received packet: build a flow key from the IPv6 header and the
 * skb (incoming ifindex, addresses, flow label, mark, next header), look
 * it up and attach the resulting dst to @skb.
 */
918 void ip6_route_input(struct sk_buff *skb)
919 {
920         const struct ipv6hdr *iph = ipv6_hdr(skb);
921         struct net *net = dev_net(skb->dev);
922         int flags = RT6_LOOKUP_F_HAS_SADDR;
923         struct flowi6 fl6 = {
924                 .flowi6_iif = skb->dev->ifindex,
925                 .daddr = iph->daddr,
926                 .saddr = iph->saddr,
                    /* First 32-bit word of the header, masked down to the
                     * flow-info bits.
                     */
927                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
928                 .flowi6_mark = skb->mark,
929                 .flowi6_proto = iph->nexthdr,
930         };
931
932         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
933 }
934
/*
 * Lookup callback for the output path: runs ip6_pol_route() using the
 * flow's outgoing interface index (flowi6_oif) as the interface argument.
 */
935 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
936                                              struct flowi6 *fl6, int flags)
937 {
938         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
939 }
940
/*
 * Resolve the route for a locally generated flow.
 *
 * @net: network namespace to look up in
 * @sk:  originating socket, may be NULL
 * @fl6: flow key; its flowi6_iif is overwritten with the loopback ifindex
 *
 * Lookup flags are derived as follows:
 *  - RT6_LOOKUP_F_IFACE if the socket is bound to a device or
 *    rt6_need_strict() says the destination requires it;
 *  - RT6_LOOKUP_F_HAS_SADDR if a source address is already set, otherwise
 *    the socket's source-address preferences (when @sk is given).
 *
 * Returns the dst produced by fib6_rule_lookup() with
 * ip6_pol_route_output() as the per-table lookup.
 */
941 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
942                                     struct flowi6 *fl6)
943 {
944         int flags = 0;
945
946         fl6->flowi6_iif = net->loopback_dev->ifindex;
947
948         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
949                 flags |= RT6_LOOKUP_F_IFACE;
950
951         if (!ipv6_addr_any(&fl6->saddr))
952                 flags |= RT6_LOOKUP_F_HAS_SADDR;
953         else if (sk)
954                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
955
956         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
957 }
958
959 EXPORT_SYMBOL(ip6_route_output);
960
/*
 * Build a packet-discarding copy of @dst_orig.
 *
 * A new entry is allocated from ip6_dst_blackhole_ops on the same device,
 * its input/output handlers are set to dst_discard, and metrics, idev,
 * gateway, flags and the destination (and, with CONFIG_IPV6_SUBTREES,
 * source) keys are copied from the original.  Expiry is cleared and the
 * metric is set to 0.
 *
 * The reference on @dst_orig is always dropped.  Returns the new dst, or
 * ERR_PTR(-ENOMEM) if the allocation failed.
 */
961 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
962 {
963         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
964         struct dst_entry *new = NULL;
965
966         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
967         if (rt) {
                    /* NOTE(review): the memset starts at rt6i_table and
                     * assumes that is the first member after the embedded
                     * dst_entry -- confirm against struct rt6_info.
                     */
968                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
969                 rt6_init_peer(rt, net->ipv6.peers);
970
971                 new = &rt->dst;
972
973                 new->__use = 1;
974                 new->input = dst_discard;
975                 new->output = dst_discard;
976
                    /* Share read-only metrics by pointer; copy them otherwise. */
977                 if (dst_metrics_read_only(&ort->dst))
978                         new->_metrics = ort->dst._metrics;
979                 else
980                         dst_copy_metrics(new, &ort->dst);
981                 rt->rt6i_idev = ort->rt6i_idev;
982                 if (rt->rt6i_idev)
983                         in6_dev_hold(rt->rt6i_idev);
984
985                 rt->rt6i_gateway = ort->rt6i_gateway;
986                 rt->rt6i_flags = ort->rt6i_flags;
987                 rt6_clean_expires(rt);
988                 rt->rt6i_metric = 0;
989
990                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
991 #ifdef CONFIG_IPV6_SUBTREES
992                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
993 #endif
994
                    /* NOTE(review): dst_free() is called on the entry that
                     * is then returned; presumably the initial refcount of 1
                     * passed to dst_alloc() defers the actual destruction --
                     * confirm against dst_free().
                     */
995                 dst_free(new);
996         }
997
998         dst_release(dst_orig);
999         return new ? new : ERR_PTR(-ENOMEM);
1000 }
1001
1002 /*
1003  *      Destination cache support functions
1004  */
1005
/*
 * Validate a cached dst against @cookie.
 *
 * The entry is considered valid only while it is still attached to a fib6
 * node whose serial number (fn_sernum) equals @cookie.  For a valid entry
 * the peer generation id is refreshed, binding a peer first if none is
 * attached.  Returns @dst when valid, NULL otherwise.
 */
1006 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1007 {
1008         struct rt6_info *rt;
1009
1010         rt = (struct rt6_info *) dst;
1011
1012         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
1013                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
1014                         if (!rt6_has_peer(rt))
1015                                 rt6_bind_peer(rt, 0);
1016                         rt->rt6i_peer_genid = rt6_peer_genid();
1017                 }
1018                 return dst;
1019         }
1020         return NULL;
1021 }
1022
/*
 * React to a caller's report that @dst is performing badly.
 *
 * - RTF_CACHE route that has expired: delete it from the table and
 *   return NULL.
 * - RTF_CACHE route that has not expired: returned unchanged.
 * - Any other route: the caller's reference is released and NULL is
 *   returned.
 *
 * A NULL @dst is passed straight through.
 */
1023 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1024 {
1025         struct rt6_info *rt = (struct rt6_info *) dst;
1026
1027         if (rt) {
1028                 if (rt->rt6i_flags & RTF_CACHE) {
1029                         if (rt6_check_expired(rt)) {
                                     /* ip6_del_rt() also drops the reference
                                      * (see dst_release() in __ip6_del_rt()).
                                      */
1030                                 ip6_del_rt(rt);
1031                                 dst = NULL;
1032                         }
1033                 } else {
1034                         dst_release(dst);
1035                         dst = NULL;
1036                 }
1037         }
1038         return dst;
1039 }
1040
/*
 * Link-failure handler for @skb.
 *
 * Sends an ICMPv6 Destination Unreachable / Address Unreachable error for
 * the packet, then penalises the route attached to it:
 *  - a RTF_CACHE route has its expiry updated with a timeout of 0;
 *  - otherwise, a RTF_DEFAULT route still linked to a fib6 node gets that
 *    node's serial number set to -1 (presumably so cached cookies compared
 *    in ip6_dst_check() stop matching).
 */
1041 static void ip6_link_failure(struct sk_buff *skb)
1042 {
1043         struct rt6_info *rt;
1044
1045         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1046
1047         rt = (struct rt6_info *) skb_dst(skb);
1048         if (rt) {
1049                 if (rt->rt6i_flags & RTF_CACHE)
1050                         rt6_update_expires(rt, 0);
1051                 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1052                         rt->rt6i_node->fn_sernum = -1;
1053         }
1054 }
1055
/*
 * Record a new, smaller path MTU on @dst.
 *
 * The dst is always confirmed.  The MTU is only lowered, and only on host
 * routes (destination prefix length 128).  When @mtu is below
 * IPV6_MIN_MTU it is raised to IPV6_MIN_MTU and RTAX_FEATURE_ALLFRAG is
 * set in the route's feature metric.  The route is flagged RTF_MODIFIED
 * and its expiry is updated using the ip6_rt_mtu_expires sysctl.
 */
1056 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1057 {
1058         struct rt6_info *rt6 = (struct rt6_info*)dst;
1059
1060         dst_confirm(dst);
1061         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1062                 struct net *net = dev_net(dst->dev);
1063
1064                 rt6->rt6i_flags |= RTF_MODIFIED;
1065                 if (mtu < IPV6_MIN_MTU) {
1066                         u32 features = dst_metric(dst, RTAX_FEATURES);
1067                         mtu = IPV6_MIN_MTU;
1068                         features |= RTAX_FEATURE_ALLFRAG;
1069                         dst_metric_set(dst, RTAX_FEATURES, features);
1070                 }
1071                 dst_metric_set(dst, RTAX_MTU, mtu);
1072                 rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
1073         }
1074 }
1075
/*
 * Apply a path-MTU update for the flow described by the IPv6 header found
 * at skb->data.
 *
 * @skb:  buffer whose data pointer is at an IPv6 header
 * @net:  network namespace for the route lookup
 * @mtu:  new MTU in network byte order
 * @oif:  outgoing interface index for the lookup
 * @mark: flow mark for the lookup
 *
 * An output route lookup is done with FLOWI_FLAG_PRECOW_METRICS set; if
 * the result carries no error the MTU is applied through
 * ip6_rt_update_pmtu().  The looked-up dst is released before returning.
 */
1076 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1077                      int oif, u32 mark)
1078 {
1079         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1080         struct dst_entry *dst;
1081         struct flowi6 fl6;
1082
1083         memset(&fl6, 0, sizeof(fl6));
1084         fl6.flowi6_oif = oif;
1085         fl6.flowi6_mark = mark;
1086         fl6.flowi6_flags = FLOWI_FLAG_PRECOW_METRICS;
1087         fl6.daddr = iph->daddr;
1088         fl6.saddr = iph->saddr;
1089         fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
1090
1091         dst = ip6_route_output(net, NULL, &fl6);
1092         if (!dst->error)
1093                 ip6_rt_update_pmtu(dst, ntohl(mtu));
1094         dst_release(dst);
1095 }
1096 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1096 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1097
/*
 * Socket flavour of ip6_update_pmtu(): takes the namespace, bound device
 * and mark from @sk.
 */
1098 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1099 {
1100         ip6_update_pmtu(skb, sock_net(sk), mtu,
1101                         sk->sk_bound_dev_if, sk->sk_mark);
1102 }
1103 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1104
/*
 * Compute the default advertised MSS for @dst.
 *
 * Starts from the dst MTU minus the IPv6 and TCP header sizes, raises the
 * result to the ip6_rt_min_advmss sysctl if it is smaller, and returns
 * IPV6_MAXPLEN when the result exceeds IPV6_MAXPLEN - sizeof(struct tcphdr).
 */
1105 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1106 {
1107         struct net_device *dev = dst->dev;
1108         unsigned int mtu = dst_mtu(dst);
1109         struct net *net = dev_net(dev);
1110
1111         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1112
1113         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1114                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1115
1116         /*
1117          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1118          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1119          * IPV6_MAXPLEN is also valid and means: "any MSS,
1120          * rely only on pmtu discovery"
1121          */
1122         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1123                 mtu = IPV6_MAXPLEN;
1124         return mtu;
1125 }
1126
/*
 * Return the MTU for @dst.
 *
 * A non-zero RTAX_MTU metric on the route wins.  Otherwise the device's
 * IPv6 configuration value (cnf.mtu6) is used, falling back to
 * IPV6_MIN_MTU when the device has no inet6_dev.
 */
1127 static unsigned int ip6_mtu(const struct dst_entry *dst)
1128 {
1129         struct inet6_dev *idev;
1130         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1131
1132         if (mtu)
1133                 return mtu;
1134
1135         mtu = IPV6_MIN_MTU;
1136
             /* idev is only dereferenced inside the RCU read-side section. */
1137         rcu_read_lock();
1138         idev = __in6_dev_get(dst->dev);
1139         if (idev)
1140                 mtu = idev->cnf.mtu6;
1141         rcu_read_unlock();
1142
1143         return mtu;
1144 }
1145
/*
 * Singly linked list (via dst.next) of the entries created by
 * icmp6_dst_alloc(); all accesses in this file take icmp6_dst_lock.
 */
1146 static struct dst_entry *icmp6_dst_gc_list;
1147 static DEFINE_SPINLOCK(icmp6_dst_lock);
1148
/*
 * Allocate a standalone host dst towards fl6->daddr on @dev.
 *
 * @dev:   output device
 * @neigh: neighbour to bind, or NULL to look one up for fl6->daddr
 * @fl6:   flow describing the destination
 *
 * The new entry is a DST_HOST /128 route with hop limit 255, refcount 1
 * and ip6_output as its output handler.  It is linked onto
 * icmp6_dst_gc_list (not inserted into a fib6 table in this function) and
 * fib6 GC is kicked so the list gets reaped later.
 *
 * Returns the result of xfrm_lookup() on the new dst, or an ERR_PTR:
 * -ENODEV if @dev has no inet6_dev, -ENOMEM if allocation fails, or the
 * error from the neighbour lookup.
 */
1149 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1150                                   struct neighbour *neigh,
1151                                   struct flowi6 *fl6)
1152 {
1153         struct dst_entry *dst;
1154         struct rt6_info *rt;
1155         struct inet6_dev *idev = in6_dev_get(dev);
1156         struct net *net = dev_net(dev);
1157
1158         if (unlikely(!idev))
1159                 return ERR_PTR(-ENODEV);
1160
1161         rt = ip6_dst_alloc(net, dev, 0, NULL);
1162         if (unlikely(!rt)) {
1163                 in6_dev_put(idev);
1164                 dst = ERR_PTR(-ENOMEM);
1165                 goto out;
1166         }
1167
             /* Take our own neighbour reference, or look one up. */
1168         if (neigh)
1169                 neigh_hold(neigh);
1170         else {
1171                 neigh = ip6_neigh_lookup(&rt->dst, NULL, &fl6->daddr);
1172                 if (IS_ERR(neigh)) {
1173                         in6_dev_put(idev);
1174                         dst_free(&rt->dst);
1175                         return ERR_CAST(neigh);
1176                 }
1177         }
1178
1179         rt->dst.flags |= DST_HOST;
1180         rt->dst.output  = ip6_output;
1181         dst_set_neighbour(&rt->dst, neigh);
1182         atomic_set(&rt->dst.__refcnt, 1);
1183         rt->rt6i_dst.addr = fl6->daddr;
1184         rt->rt6i_dst.plen = 128;
             /* The idev reference taken at the top is handed to the route. */
1185         rt->rt6i_idev     = idev;
1186         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1187
             /* Push onto the head of the GC list. */
1188         spin_lock_bh(&icmp6_dst_lock);
1189         rt->dst.next = icmp6_dst_gc_list;
1190         icmp6_dst_gc_list = &rt->dst;
1191         spin_unlock_bh(&icmp6_dst_lock);
1192
1193         fib6_force_start_gc(net);
1194
1195         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1196
1197 out:
1198         return dst;
1199 }
1200
/*
 * Reap icmp6_dst_gc_list: unlink and free every entry whose reference
 * count has dropped to zero.
 *
 * Returns the number of entries that are still referenced and therefore
 * remain on the list.
 */
1201 int icmp6_dst_gc(void)
1202 {
1203         struct dst_entry *dst, **pprev;
1204         int more = 0;
1205
1206         spin_lock_bh(&icmp6_dst_lock);
1207         pprev = &icmp6_dst_gc_list;
1208
             /* pprev points at the link to rewrite when unlinking dst. */
1209         while ((dst = *pprev) != NULL) {
1210                 if (!atomic_read(&dst->__refcnt)) {
1211                         *pprev = dst->next;
1212                         dst_free(dst);
1213                 } else {
1214                         pprev = &dst->next;
1215                         ++more;
1216                 }
1217         }
1218
1219         spin_unlock_bh(&icmp6_dst_lock);
1220
1221         return more;
1222 }
1223
/*
 * Walk icmp6_dst_gc_list and call @func(rt, @arg) on every entry; entries
 * for which @func returns non-zero are unlinked and freed.
 *
 * @func runs with icmp6_dst_lock held and bottom halves disabled.
 */
1224 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1225                             void *arg)
1226 {
1227         struct dst_entry *dst, **pprev;
1228
1229         spin_lock_bh(&icmp6_dst_lock);
1230         pprev = &icmp6_dst_gc_list;
1231         while ((dst = *pprev) != NULL) {
1232                 struct rt6_info *rt = (struct rt6_info *) dst;
1233                 if (func(rt, arg)) {
1234                         *pprev = dst->next;
1235                         dst_free(dst);
1236                 } else {
1237                         pprev = &dst->next;
1238                 }
1239         }
1240         spin_unlock_bh(&icmp6_dst_lock);
1241 }
1242
/*
 * Garbage-collection hook for the per-namespace IPv6 dst_ops.
 *
 * GC is skipped when the minimum interval since the last run has not yet
 * elapsed and the (fast) entry count does not exceed ip6_rt_max_size.
 * Otherwise ip6_rt_gc_expire is incremented and passed to fib6_run_gc(),
 * the last-GC timestamp is updated, and, if the exact entry count is now
 * below ops->gc_thresh, ip6_rt_gc_expire is reset to half of
 * ip6_rt_gc_timeout.
 *
 * On every call ip6_rt_gc_expire is then reduced by itself shifted right
 * by ip6_rt_gc_elasticity.
 *
 * Returns non-zero when the entry count still exceeds ip6_rt_max_size.
 */
1243 static int ip6_dst_gc(struct dst_ops *ops)
1244 {
1245         unsigned long now = jiffies;
1246         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1247         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1248         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1249         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1250         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1251         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1252         int entries;
1253
1254         entries = dst_entries_get_fast(ops);
1255         if (time_after(rt_last_gc + rt_min_interval, now) &&
1256             entries <= rt_max_size)
1257                 goto out;
1258
1259         net->ipv6.ip6_rt_gc_expire++;
1260         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1261         net->ipv6.ip6_rt_last_gc = now;
1262         entries = dst_entries_get_slow(ops);
1263         if (entries < ops->gc_thresh)
1264                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1265 out:
1266         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1267         return entries > rt_max_size;
1268 }
1269
1270 /* Clean host part of a prefix. Not necessary in radix tree,
1271    but results in cleaner routing tables.
1272
1273    Remove it only when all the things will work!
1274  */
1275
/*
 * Return the hop limit to use for @dst.
 *
 * A non-zero RTAX_HOPLIMIT metric on the route wins.  Otherwise the value
 * comes from the device's IPv6 configuration, or from the namespace-wide
 * devconf_all setting when the device has no inet6_dev.
 */
1276 int ip6_dst_hoplimit(struct dst_entry *dst)
1277 {
1278         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1279         if (hoplimit == 0) {
1280                 struct net_device *dev = dst->dev;
1281                 struct inet6_dev *idev;
1282
1283                 rcu_read_lock();
1284                 idev = __in6_dev_get(dev);
1285                 if (idev)
1286                         hoplimit = idev->cnf.hop_limit;
1287                 else
1288                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1289                 rcu_read_unlock();
1290         }
1291         return hoplimit;
1292 }
1293 EXPORT_SYMBOL(ip6_dst_hoplimit);
1293 EXPORT_SYMBOL(ip6_dst_hoplimit);
1294
1295 /*
1296  *
1297  */
1298
/*
 * Create a route from @cfg and insert it into its fib6 table.
 *
 * Outline:
 *  1. Validate prefix lengths (source prefixes need CONFIG_IPV6_SUBTREES)
 *     and, if an ifindex is given, take references on the device and its
 *     inet6_dev.
 *  2. Default the metric to IP6_RT_PRIO_USER and find or create the table.
 *  3. Allocate the rt6_info and fill in expiry, protocol, input/output
 *     handlers, destination/source keys and metric.
 *  4. RTF_REJECT routes, and non-local routes via a loopback device to a
 *     non-loopback address, are turned into reject routes bound to the
 *     namespace's loopback device.
 *  5. For RTF_GATEWAY routes, validate the gateway: a gateway that is not
 *     a link-local unicast address must be unicast and reachable through
 *     an existing non-gateway route on the same device.
 *  6. Validate the preferred source address, bind a neighbour when
 *     needed, apply the netlink-supplied metrics and insert the route.
 *
 * @cfg is modified: fc_metric and fc_protocol may be defaulted and
 * fc_nlinfo.nl_net is set from the device.
 *
 * Returns the result of __ip6_ins_rt() on the success path, or a negative
 * errno.  On the error path the device/idev references and the
 * partially built route are released.
 */
1299 int ip6_route_add(struct fib6_config *cfg)
1300 {
1301         int err;
1302         struct net *net = cfg->fc_nlinfo.nl_net;
1303         struct rt6_info *rt = NULL;
1304         struct net_device *dev = NULL;
1305         struct inet6_dev *idev = NULL;
1306         struct fib6_table *table;
1307         int addr_type;
1308
1309         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1310                 return -EINVAL;
1311 #ifndef CONFIG_IPV6_SUBTREES
1312         if (cfg->fc_src_len)
1313                 return -EINVAL;
1314 #endif
1315         if (cfg->fc_ifindex) {
1316                 err = -ENODEV;
1317                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1318                 if (!dev)
1319                         goto out;
1320                 idev = in6_dev_get(dev);
1321                 if (!idev)
1322                         goto out;
1323         }
1324
1325         if (cfg->fc_metric == 0)
1326                 cfg->fc_metric = IP6_RT_PRIO_USER;
1327
1328         err = -ENOBUFS;
             /* Without NLM_F_CREATE an existing table is preferred; a
              * missing one is still created, with a warning.
              */
1329         if (cfg->fc_nlinfo.nlh &&
1330             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1331                 table = fib6_get_table(net, cfg->fc_table);
1332                 if (!table) {
1333                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1334                         table = fib6_new_table(net, cfg->fc_table);
1335                 }
1336         } else {
1337                 table = fib6_new_table(net, cfg->fc_table);
1338         }
1339
1340         if (!table)
1341                 goto out;
1342
1343         rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT, table);
1344
1345         if (!rt) {
1346                 err = -ENOMEM;
1347                 goto out;
1348         }
1349
1350         rt->dst.obsolete = -1;
1351
1352         if (cfg->fc_flags & RTF_EXPIRES)
1353                 rt6_set_expires(rt, jiffies +
1354                                 clock_t_to_jiffies(cfg->fc_expires));
1355         else
1356                 rt6_clean_expires(rt);
1357
1358         if (cfg->fc_protocol == RTPROT_UNSPEC)
1359                 cfg->fc_protocol = RTPROT_BOOT;
1360         rt->rt6i_protocol = cfg->fc_protocol;
1361
1362         addr_type = ipv6_addr_type(&cfg->fc_dst);
1363
             /* Pick the input handler from the destination type/flags. */
1364         if (addr_type & IPV6_ADDR_MULTICAST)
1365                 rt->dst.input = ip6_mc_input;
1366         else if (cfg->fc_flags & RTF_LOCAL)
1367                 rt->dst.input = ip6_input;
1368         else
1369                 rt->dst.input = ip6_forward;
1370
1371         rt->dst.output = ip6_output;
1372
1373         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1374         rt->rt6i_dst.plen = cfg->fc_dst_len;
1375         if (rt->rt6i_dst.plen == 128)
1376                rt->dst.flags |= DST_HOST;
1377
             /* Non-host routes with metrics get a private, zeroed metrics
              * array.
              */
1378         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1379                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1380                 if (!metrics) {
1381                         err = -ENOMEM;
1382                         goto out;
1383                 }
1384                 dst_init_metrics(&rt->dst, metrics, 0);
1385         }
1386 #ifdef CONFIG_IPV6_SUBTREES
1387         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1388         rt->rt6i_src.plen = cfg->fc_src_len;
1389 #endif
1390
1391         rt->rt6i_metric = cfg->fc_metric;
1392
1393         /* We cannot add true routes via loopback here,
1394            they would result in kernel looping; promote them to reject routes
1395          */
1396         if ((cfg->fc_flags & RTF_REJECT) ||
1397             (dev && (dev->flags & IFF_LOOPBACK) &&
1398              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1399              !(cfg->fc_flags & RTF_LOCAL))) {
1400                 /* hold loopback dev/idev if we haven't done so. */
1401                 if (dev != net->loopback_dev) {
1402                         if (dev) {
1403                                 dev_put(dev);
1404                                 in6_dev_put(idev);
1405                         }
1406                         dev = net->loopback_dev;
1407                         dev_hold(dev);
1408                         idev = in6_dev_get(dev);
1409                         if (!idev) {
1410                                 err = -ENODEV;
1411                                 goto out;
1412                         }
1413                 }
1414                 rt->dst.output = ip6_pkt_discard_out;
1415                 rt->dst.input = ip6_pkt_discard;
1416                 rt->dst.error = -ENETUNREACH;
1417                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1418                 goto install_route;
1419         }
1420
1421         if (cfg->fc_flags & RTF_GATEWAY) {
1422                 const struct in6_addr *gw_addr;
1423                 int gwa_type;
1424
1425                 gw_addr = &cfg->fc_gateway;
1426                 rt->rt6i_gateway = *gw_addr;
1427                 gwa_type = ipv6_addr_type(gw_addr);
1428
1429                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1430                         struct rt6_info *grt;
1431
1432                         /* IPv6 strictly inhibits using not link-local
1433                            addresses as nexthop address.
1434                            Otherwise, router will not be able to send redirects.
1435                            It is very good, but in some (rare!) circumstances
1436                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1437                            some exceptions. --ANK
1438                          */
1439                         err = -EINVAL;
1440                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1441                                 goto out;
1442
1443                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1444
1445                         err = -EHOSTUNREACH;
1446                         if (!grt)
1447                                 goto out;
1448                         if (dev) {
1449                                 if (dev != grt->dst.dev) {
1450                                         dst_release(&grt->dst);
1451                                         goto out;
1452                                 }
1453                         } else {
                                     /* No device given: inherit it from the
                                      * route to the gateway.
                                      */
1454                                 dev = grt->dst.dev;
1455                                 idev = grt->rt6i_idev;
1456                                 dev_hold(dev);
1457                                 in6_dev_hold(grt->rt6i_idev);
1458                         }
                             /* err stays -EHOSTUNREACH if the gateway is
                              * itself only reachable through a gateway.
                              */
1459                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1460                                 err = 0;
1461                         dst_release(&grt->dst);
1462
1463                         if (err)
1464                                 goto out;
1465                 }
1466                 err = -EINVAL;
1467                 if (!dev || (dev->flags & IFF_LOOPBACK))
1468                         goto out;
1469         }
1470
1471         err = -ENODEV;
1472         if (!dev)
1473                 goto out;
1474
1475         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1476                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1477                         err = -EINVAL;
1478                         goto out;
1479                 }
1480                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1481                 rt->rt6i_prefsrc.plen = 128;
1482         } else
1483                 rt->rt6i_prefsrc.plen = 0;
1484
1485         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1486                 err = rt6_bind_neighbour(rt, dev);
1487                 if (err)
1488                         goto out;
1489         }
1490
1491         rt->rt6i_flags = cfg->fc_flags;
1492
1493 install_route:
1494         if (cfg->fc_mx) {
1495                 struct nlattr *nla;
1496                 int remaining;
1497
                     /* Apply each netlink metric attribute; type 0 is
                      * skipped and anything above RTAX_MAX is rejected.
                      */
1498                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1499                         int type = nla_type(nla);
1500
1501                         if (type) {
1502                                 if (type > RTAX_MAX) {
1503                                         err = -EINVAL;
1504                                         goto out;
1505                                 }
1506
1507                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1508                         }
1509                 }
1510         }
1511
             /* The dev/idev references taken above are handed to the route. */
1512         rt->dst.dev = dev;
1513         rt->rt6i_idev = idev;
1514         rt->rt6i_table = table;
1515
1516         cfg->fc_nlinfo.nl_net = dev_net(dev);
1517
1518         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1519
1520 out:
1521         if (dev)
1522                 dev_put(dev);
1523         if (idev)
1524                 in6_dev_put(idev);
1525         if (rt)
1526                 dst_free(&rt->dst);
1527         return err;
1528 }
1529
/*
 * Remove @rt from its fib6 table under the table write lock.
 *
 * The caller's reference on @rt is consumed: dst_release() is called
 * after fib6_del(), whatever its result.  The namespace's ip6_null_entry
 * is never deleted; -ENOENT is returned for it and no reference is
 * dropped.  Otherwise returns the result of fib6_del().
 */
1530 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1531 {
1532         int err;
1533         struct fib6_table *table;
1534         struct net *net = dev_net(rt->dst.dev);
1535
1536         if (rt == net->ipv6.ip6_null_entry)
1537                 return -ENOENT;
1538
1539         table = rt->rt6i_table;
1540         write_lock_bh(&table->tb6_lock);
1541
1542         err = fib6_del(rt, info);
1543         dst_release(&rt->dst);
1544
1545         write_unlock_bh(&table->tb6_lock);
1546
1547         return err;
1548 }
1549
/*
 * Delete @rt using a minimal nl_info that only carries the route's
 * network namespace.  Consumes the caller's reference on @rt in the same
 * way as __ip6_del_rt().
 */
1550 int ip6_del_rt(struct rt6_info *rt)
1551 {
1552         struct nl_info info = {
1553                 .nl_net = dev_net(rt->dst.dev),
1554         };
1555         return __ip6_del_rt(rt, &info);
1556 }
1557
/*
 * Delete the first route matching @cfg.
 *
 * The node for the exact destination/source prefix is located in the
 * configured table and its routes are scanned; optional filters are the
 * interface index, the gateway (only with RTF_GATEWAY set) and the metric
 * (only when non-zero).
 *
 * Returns -ESRCH when the table, the node or a matching route is not
 * found, otherwise the result of __ip6_del_rt().
 */
1558 static int ip6_route_del(struct fib6_config *cfg)
1559 {
1560         struct fib6_table *table;
1561         struct fib6_node *fn;
1562         struct rt6_info *rt;
1563         int err = -ESRCH;
1564
1565         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1566         if (!table)
1567                 return err;
1568
1569         read_lock_bh(&table->tb6_lock);
1570
1571         fn = fib6_locate(&table->tb6_root,
1572                          &cfg->fc_dst, cfg->fc_dst_len,
1573                          &cfg->fc_src, cfg->fc_src_len);
1574
1575         if (fn) {
1576                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1577                         if (cfg->fc_ifindex &&
1578                             (!rt->dst.dev ||
1579                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1580                                 continue;
1581                         if (cfg->fc_flags & RTF_GATEWAY &&
1582                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1583                                 continue;
1584                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1585                                 continue;
                             /* Pin the match before dropping the read lock;
                              * __ip6_del_rt() releases this reference.
                              */
1586                         dst_hold(&rt->dst);
1587                         read_unlock_bh(&table->tb6_lock);
1588
1589                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1590                 }
1591         }
1592         read_unlock_bh(&table->tb6_lock);
1593
1594         return err;
1595 }
1596
1597 /*
1598  *      Handle redirects
1599  */
/*
 * Flow key extended with the address of the router that sent a redirect.
 * fl6 must remain the first member: __ip6_route_redirect() receives a
 * struct flowi6 pointer and casts it back to this type.
 */
1600 struct ip6rd_flowi {
1601         struct flowi6 fl6;
1602         struct in6_addr gateway;
1603 };
1604
/*
 * Per-table lookup used when validating a received redirect.
 *
 * @fl6 must be embedded in a struct ip6rd_flowi, whose gateway member
 * holds the address of the router that sent the redirect.  The routes of
 * the node matching the flow are scanned for an unexpired RTF_GATEWAY
 * route on interface fl6->flowi6_oif whose gateway equals that address.
 *
 * Returns the matching route, or the namespace's ip6_null_entry when
 * nothing matches, with a reference held in either case.
 *
 * NOTE(review): the function definition ends with a stray ';' after the
 * closing brace; it is harmless but could be dropped.
 */
1605 static struct rt6_info *__ip6_route_redirect(struct net *net,
1606                                              struct fib6_table *table,
1607                                              struct flowi6 *fl6,
1608                                              int flags)
1609 {
1610         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1611         struct rt6_info *rt;
1612         struct fib6_node *fn;
1613
1614         /*
1615          * Get the "current" route for this destination and
1616          * check if the redirect has come from appropriate router.
1617          *
1618          * RFC 2461 specifies that redirects should only be
1619          * accepted if they come from the nexthop to the target.
1620          * Due to the way the routes are chosen, this notion
1621          * is a bit fuzzy and one might need to check all possible
1622          * routes.
1623          */
1624
1625         read_lock_bh(&table->tb6_lock);
1626         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1627 restart:
1628         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1629                 /*
1630                  * Current route is on-link; redirect is always invalid.
1631                  *
1632                  * Seems, previous statement is not true. It could
1633                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1634                  * But then router serving it might decide, that we should
1635                  * know truth 8)8) --ANK (980726).
1636                  */
1637                 if (rt6_check_expired(rt))
1638                         continue;
1639                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1640                         continue;
1641                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1642                         continue;
1643                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1644                         continue;
1645                 break;
1646         }
1647
1648         if (!rt)
1649                 rt = net->ipv6.ip6_null_entry;
             /* NOTE(review): BACKTRACK is defined outside this view; it
              * presumably uses the "restart" and "out" labels -- confirm.
              */
1650         BACKTRACK(net, &fl6->saddr);
1651 out:
1652         dst_hold(&rt->dst);
1653
1654         read_unlock_bh(&table->tb6_lock);
1655
1656         return rt;
1657 };
1658
/*
 * Find the route a redirect applies to.
 *
 * @dest:    destination the redirect is about
 * @src:     source address used as the flow's saddr
 * @gateway: address of the router that sent the redirect
 * @dev:     device the redirect arrived on (used as flowi6_oif)
 *
 * Builds an ip6rd_flowi and runs the policy lookup with
 * __ip6_route_redirect() as the per-table callback.
 */
1659 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1660                                            const struct in6_addr *src,
1661                                            const struct in6_addr *gateway,
1662                                            struct net_device *dev)
1663 {
1664         int flags = RT6_LOOKUP_F_HAS_SADDR;
1665         struct net *net = dev_net(dev);
1666         struct ip6rd_flowi rdfl = {
1667                 .fl6 = {
1668                         .flowi6_oif = dev->ifindex,
1669                         .daddr = *dest,
1670                         .saddr = *src,
1671                 },
1672         };
1673
1674         rdfl.gateway = *gateway;
1675
1676         if (rt6_need_strict(dest))
1677                 flags |= RT6_LOOKUP_F_IFACE;
1678
1679         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1680                                                    flags, __ip6_route_redirect);
1681 }
1682
/*
 * Process an accepted redirect message.
 *
 * @dest:    destination being redirected
 * @src:     passed to the route lookup as the flow's source address
 * @saddr:   passed to the route lookup as the redirecting router's address
 * @neigh:   neighbour entry for the new first hop
 * @lladdr:  link-layer address to record on @neigh
 * @on_link: non-zero if the destination is itself the new first hop
 *
 * The redirect is ignored if no current gateway route through the sender
 * exists.  Otherwise @neigh is updated, the old route is confirmed, and,
 * unless the route already uses @neigh, a RTF_CACHE|RTF_DYNAMIC host
 * route for @dest via @neigh is inserted.  A NETEVENT_REDIRECT
 * notification carrying the old/new dst, the old/new neighbour and @dest
 * is then raised, and a superseded RTF_CACHE route is deleted.
 */
1683 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1684                   const struct in6_addr *saddr,
1685                   struct neighbour *neigh, u8 *lladdr, int on_link)
1686 {
1687         struct rt6_info *rt, *nrt = NULL;
1688         struct netevent_redirect netevent;
1689         struct net *net = dev_net(neigh->dev);
1690         struct neighbour *old_neigh;
1691
1692         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1693
1694         if (rt == net->ipv6.ip6_null_entry) {
1695                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1696                 goto out;
1697         }
1698
1699         /*
1700          *      We have finally decided to accept it.
1701          */
1702
1703         neigh_update(neigh, lladdr, NUD_STALE,
1704                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1705                      NEIGH_UPDATE_F_OVERRIDE|
1706                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1707                                      NEIGH_UPDATE_F_ISROUTER))
1708                      );
1709
1710         /*
1711          * Redirect received -> path was valid.
1712          * Look, redirects are sent only in response to data packets,
1713          * so that this nexthop apparently is reachable. --ANK
1714          */
1715         dst_confirm(&rt->dst);
1716
1717         /* Duplicate redirect: silently ignore. */
1718         old_neigh = dst_get_neighbour_noref_raw(&rt->dst);
1719         if (neigh == old_neigh)
1720                 goto out;
1721
1722         nrt = ip6_rt_copy(rt, dest);
1723         if (!nrt)
1724                 goto out;
1725
1726         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1727         if (on_link)
1728                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1729
1730         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1731         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1732
1733         if (ip6_ins_rt(nrt))
1734                 goto out;
1735
             /* Tell listeners about the route change, including both
              * neighbours and the affected destination.
              */
1736         netevent.old = &rt->dst;
1737         netevent.old_neigh = old_neigh;
1738         netevent.new = &nrt->dst;
1739         netevent.new_neigh = neigh;
1740         netevent.daddr = dest;
1741         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1742
             /* ip6_del_rt() consumes our reference on rt (dst_release() in
              * __ip6_del_rt()), so the release at "out" must be skipped.
              */
1743         if (rt->rt6i_flags & RTF_CACHE) {
1744                 ip6_del_rt(rt);
1745                 return;
1746         }
1747
1748 out:
1749         dst_release(&rt->dst);
1750 }
1751
1752 /*
1753  *      Misc support functions
1754  */
1755
/*
 * Allocate a host-route (/128, DST_HOST) copy of @ort for address @dest.
 *
 * Handlers, metrics, error, idev (with a new reference), gateway, flags,
 * preferred source, table and (with CONFIG_IPV6_SUBTREES) the source key
 * are copied from @ort; the metric is set to 0 and lastuse to the current
 * jiffies.  If @ort has both RTF_DEFAULT and RTF_ADDRCONF set the copy is
 * linked to it through rt6_set_from(); otherwise the copy's expiry is
 * cleared.
 *
 * The copy is not inserted into any table here.  Returns NULL if the
 * allocation fails.
 */
1756 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1757                                     const struct in6_addr *dest)
1758 {
1759         struct net *net = dev_net(ort->dst.dev);
1760         struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
1761                                             ort->rt6i_table);
1762
1763         if (rt) {
1764                 rt->dst.input = ort->dst.input;
1765                 rt->dst.output = ort->dst.output;
1766                 rt->dst.flags |= DST_HOST;
1767
1768                 rt->rt6i_dst.addr = *dest;
1769                 rt->rt6i_dst.plen = 128;
1770                 dst_copy_metrics(&rt->dst, &ort->dst);
1771                 rt->dst.error = ort->dst.error;
1772                 rt->rt6i_idev = ort->rt6i_idev;
1773                 if (rt->rt6i_idev)
1774                         in6_dev_hold(rt->rt6i_idev);
1775                 rt->dst.lastuse = jiffies;
1776
1777                 rt->rt6i_gateway = ort->rt6i_gateway;
1778                 rt->rt6i_flags = ort->rt6i_flags;
1779                 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1780                     (RTF_DEFAULT | RTF_ADDRCONF))
1781                         rt6_set_from(rt, ort);
1782                 else
1783                         rt6_clean_expires(rt);
1784                 rt->rt6i_metric = 0;
1785
1786 #ifdef CONFIG_IPV6_SUBTREES
1787                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1788 #endif
1789                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1790                 rt->rt6i_table = ort->rt6i_table;
1791         }
1792         return rt;
1793 }
1794
1795 #ifdef CONFIG_IPV6_ROUTE_INFO
1796 static struct rt6_info *rt6_get_route_info(struct net *net,
1797                                            const struct in6_addr *prefix, int prefixlen,
1798                                            const struct in6_addr *gwaddr, int ifindex)
1799 {
1800         struct fib6_node *fn;
1801         struct rt6_info *rt = NULL;
1802         struct fib6_table *table;
1803
1804         table = fib6_get_table(net, RT6_TABLE_INFO);
1805         if (!table)
1806                 return NULL;
1807
1808         write_lock_bh(&table->tb6_lock);
1809         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1810         if (!fn)
1811                 goto out;
1812
1813         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1814                 if (rt->dst.dev->ifindex != ifindex)
1815                         continue;
1816                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1817                         continue;
1818                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1819                         continue;
1820                 dst_hold(&rt->dst);
1821                 break;
1822         }
1823 out:
1824         write_unlock_bh(&table->tb6_lock);
1825         return rt;
1826 }
1827
1828 static struct rt6_info *rt6_add_route_info(struct net *net,
1829                                            const struct in6_addr *prefix, int prefixlen,
1830                                            const struct in6_addr *gwaddr, int ifindex,
1831                                            unsigned int pref)
1832 {
1833         struct fib6_config cfg = {
1834                 .fc_table       = RT6_TABLE_INFO,
1835                 .fc_metric      = IP6_RT_PRIO_USER,
1836                 .fc_ifindex     = ifindex,
1837                 .fc_dst_len     = prefixlen,
1838                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1839                                   RTF_UP | RTF_PREF(pref),
1840                 .fc_nlinfo.pid = 0,
1841                 .fc_nlinfo.nlh = NULL,
1842                 .fc_nlinfo.nl_net = net,
1843         };
1844
1845         cfg.fc_dst = *prefix;
1846         cfg.fc_gateway = *gwaddr;
1847
1848         /* We should treat it as a default route if prefix length is 0. */
1849         if (!prefixlen)
1850                 cfg.fc_flags |= RTF_DEFAULT;
1851
1852         ip6_route_add(&cfg);
1853
1854         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1855 }
1856 #endif
1857
1858 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1859 {
1860         struct rt6_info *rt;
1861         struct fib6_table *table;
1862
1863         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1864         if (!table)
1865                 return NULL;
1866
1867         write_lock_bh(&table->tb6_lock);
1868         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1869                 if (dev == rt->dst.dev &&
1870                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1871                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1872                         break;
1873         }
1874         if (rt)
1875                 dst_hold(&rt->dst);
1876         write_unlock_bh(&table->tb6_lock);
1877         return rt;
1878 }
1879
1880 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1881                                      struct net_device *dev,
1882                                      unsigned int pref)
1883 {
1884         struct fib6_config cfg = {
1885                 .fc_table       = RT6_TABLE_DFLT,
1886                 .fc_metric      = IP6_RT_PRIO_USER,
1887                 .fc_ifindex     = dev->ifindex,
1888                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1889                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1890                 .fc_nlinfo.pid = 0,
1891                 .fc_nlinfo.nlh = NULL,
1892                 .fc_nlinfo.nl_net = dev_net(dev),
1893         };
1894
1895         cfg.fc_gateway = *gwaddr;
1896
1897         ip6_route_add(&cfg);
1898
1899         return rt6_get_dflt_router(gwaddr, dev);
1900 }
1901
1902 void rt6_purge_dflt_routers(struct net *net)
1903 {
1904         struct rt6_info *rt;
1905         struct fib6_table *table;
1906
1907         /* NOTE: Keep consistent with rt6_get_dflt_router */
1908         table = fib6_get_table(net, RT6_TABLE_DFLT);
1909         if (!table)
1910                 return;
1911
1912 restart:
1913         read_lock_bh(&table->tb6_lock);
1914         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1915                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1916                         dst_hold(&rt->dst);
1917                         read_unlock_bh(&table->tb6_lock);
1918                         ip6_del_rt(rt);
1919                         goto restart;
1920                 }
1921         }
1922         read_unlock_bh(&table->tb6_lock);
1923 }
1924
1925 static void rtmsg_to_fib6_config(struct net *net,
1926                                  struct in6_rtmsg *rtmsg,
1927                                  struct fib6_config *cfg)
1928 {
1929         memset(cfg, 0, sizeof(*cfg));
1930
1931         cfg->fc_table = RT6_TABLE_MAIN;
1932         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1933         cfg->fc_metric = rtmsg->rtmsg_metric;
1934         cfg->fc_expires = rtmsg->rtmsg_info;
1935         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1936         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1937         cfg->fc_flags = rtmsg->rtmsg_flags;
1938
1939         cfg->fc_nlinfo.nl_net = net;
1940
1941         cfg->fc_dst = rtmsg->rtmsg_dst;
1942         cfg->fc_src = rtmsg->rtmsg_src;
1943         cfg->fc_gateway = rtmsg->rtmsg_gateway;
1944 }
1945
1946 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1947 {
1948         struct fib6_config cfg;
1949         struct in6_rtmsg rtmsg;
1950         int err;
1951
1952         switch(cmd) {
1953         case SIOCADDRT:         /* Add a route */
1954         case SIOCDELRT:         /* Delete a route */
1955                 if (!capable(CAP_NET_ADMIN))
1956                         return -EPERM;
1957                 err = copy_from_user(&rtmsg, arg,
1958                                      sizeof(struct in6_rtmsg));
1959                 if (err)
1960                         return -EFAULT;
1961
1962                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1963
1964                 rtnl_lock();
1965                 switch (cmd) {
1966                 case SIOCADDRT:
1967                         err = ip6_route_add(&cfg);
1968                         break;
1969                 case SIOCDELRT:
1970                         err = ip6_route_del(&cfg);
1971                         break;
1972                 default:
1973                         err = -EINVAL;
1974                 }
1975                 rtnl_unlock();
1976
1977                 return err;
1978         }
1979
1980         return -EINVAL;
1981 }
1982
1983 /*
1984  *      Drop the packet on the floor
1985  */
1986
1987 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1988 {
1989         int type;
1990         struct dst_entry *dst = skb_dst(skb);
1991         switch (ipstats_mib_noroutes) {
1992         case IPSTATS_MIB_INNOROUTES:
1993                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1994                 if (type == IPV6_ADDR_ANY) {
1995                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1996                                       IPSTATS_MIB_INADDRERRORS);
1997                         break;
1998                 }
1999                 /* FALLTHROUGH */
2000         case IPSTATS_MIB_OUTNOROUTES:
2001                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2002                               ipstats_mib_noroutes);
2003                 break;
2004         }
2005         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2006         kfree_skb(skb);
2007         return 0;
2008 }
2009
2010 static int ip6_pkt_discard(struct sk_buff *skb)
2011 {
2012         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2013 }
2014
2015 static int ip6_pkt_discard_out(struct sk_buff *skb)
2016 {
2017         skb->dev = skb_dst(skb)->dev;
2018         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2019 }
2020
2021 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2022
2023 static int ip6_pkt_prohibit(struct sk_buff *skb)
2024 {
2025         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2026 }
2027
2028 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2029 {
2030         skb->dev = skb_dst(skb)->dev;
2031         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2032 }
2033
2034 #endif
2035
2036 /*
2037  *      Allocate a dst for local (unicast / anycast) address.
2038  */
2039
2040 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2041                                     const struct in6_addr *addr,
2042                                     bool anycast)
2043 {
2044         struct net *net = dev_net(idev->dev);
2045         struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL);
2046         int err;
2047
2048         if (!rt) {
2049                 net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
2050                 return ERR_PTR(-ENOMEM);
2051         }
2052
2053         in6_dev_hold(idev);
2054
2055         rt->dst.flags |= DST_HOST;
2056         rt->dst.input = ip6_input;
2057         rt->dst.output = ip6_output;
2058         rt->rt6i_idev = idev;
2059         rt->dst.obsolete = -1;
2060
2061         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2062         if (anycast)
2063                 rt->rt6i_flags |= RTF_ANYCAST;
2064         else
2065                 rt->rt6i_flags |= RTF_LOCAL;
2066         err = rt6_bind_neighbour(rt, rt->dst.dev);
2067         if (err) {
2068                 dst_free(&rt->dst);
2069                 return ERR_PTR(err);
2070         }
2071
2072         rt->rt6i_dst.addr = *addr;
2073         rt->rt6i_dst.plen = 128;
2074         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2075
2076         atomic_set(&rt->dst.__refcnt, 1);
2077
2078         return rt;
2079 }
2080
2081 int ip6_route_get_saddr(struct net *net,
2082                         struct rt6_info *rt,
2083                         const struct in6_addr *daddr,
2084                         unsigned int prefs,
2085                         struct in6_addr *saddr)
2086 {
2087         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2088         int err = 0;
2089         if (rt->rt6i_prefsrc.plen)
2090                 *saddr = rt->rt6i_prefsrc.addr;
2091         else
2092                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2093                                          daddr, prefs, saddr);
2094         return err;
2095 }
2096
2097 /* remove deleted ip from prefsrc entries */
/* Walker context handed to fib6_remove_prefsrc(). */
struct arg_dev_net_ip {
	struct net_device	*dev;	/* device to match; NULL matches any */
	struct net		*net;	/* namespace whose null entry is skipped */
	struct in6_addr		*addr;	/* address compared with each prefsrc */
};
2103
2104 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2105 {
2106         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2107         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2108         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2109
2110         if (((void *)rt->dst.dev == dev || !dev) &&
2111             rt != net->ipv6.ip6_null_entry &&
2112             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2113                 /* remove prefsrc entry */
2114                 rt->rt6i_prefsrc.plen = 0;
2115         }
2116         return 0;
2117 }
2118
2119 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2120 {
2121         struct net *net = dev_net(ifp->idev->dev);
2122         struct arg_dev_net_ip adni = {
2123                 .dev = ifp->idev->dev,
2124                 .net = net,
2125                 .addr = &ifp->addr,
2126         };
2127         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2128 }
2129
/* Walker context handed to fib6_ifdown(). */
struct arg_dev_net {
	struct net_device	*dev;	/* device to match; NULL matches any */
	struct net		*net;	/* namespace whose null entry is skipped */
};
2134
2135 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2136 {
2137         const struct arg_dev_net *adn = arg;
2138         const struct net_device *dev = adn->dev;
2139
2140         if ((rt->dst.dev == dev || !dev) &&
2141             rt != adn->net->ipv6.ip6_null_entry)
2142                 return -1;
2143
2144         return 0;
2145 }
2146
2147 void rt6_ifdown(struct net *net, struct net_device *dev)
2148 {
2149         struct arg_dev_net adn = {
2150                 .dev = dev,
2151                 .net = net,
2152         };
2153
2154         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2155         icmp6_clean_all(fib6_ifdown, &adn);
2156 }
2157
/* Walker context handed to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device	*dev;	/* device whose MTU changed */
	unsigned int		mtu;	/* the new MTU */
};
2162
2163 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2164 {
2165         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2166         struct inet6_dev *idev;
2167
2168         /* In IPv6 pmtu discovery is not optional,
2169            so that RTAX_MTU lock cannot disable it.
2170            We still use this lock to block changes
2171            caused by addrconf/ndisc.
2172         */
2173
2174         idev = __in6_dev_get(arg->dev);
2175         if (!idev)
2176                 return 0;
2177
2178         /* For administrative MTU increase, there is no way to discover
2179            IPv6 PMTU increase, so PMTU increase should be updated here.
2180            Since RFC 1981 doesn't include administrative MTU increase
2181            update PMTU increase is a MUST. (i.e. jumbo frame)
2182          */
2183         /*
2184            If new MTU is less than route PMTU, this new MTU will be the
2185            lowest MTU in the path, update the route PMTU to reflect PMTU
2186            decreases; if new MTU is greater than route PMTU, and the
2187            old MTU is the lowest MTU in the path, update the route PMTU
2188            to reflect the increase. In this case if the other nodes' MTU
2189            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2190            PMTU discouvery.
2191          */
2192         if (rt->dst.dev == arg->dev &&
2193             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2194             (dst_mtu(&rt->dst) >= arg->mtu ||
2195              (dst_mtu(&rt->dst) < arg->mtu &&
2196               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2197                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2198         }
2199         return 0;
2200 }
2201
2202 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2203 {
2204         struct rt6_mtu_change_arg arg = {
2205                 .dev = dev,
2206                 .mtu = mtu,
2207         };
2208
2209         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2210 }
2211
2212 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2213         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2214         [RTA_OIF]               = { .type = NLA_U32 },
2215         [RTA_IIF]               = { .type = NLA_U32 },
2216         [RTA_PRIORITY]          = { .type = NLA_U32 },
2217         [RTA_METRICS]           = { .type = NLA_NESTED },
2218 };
2219
2220 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2221                               struct fib6_config *cfg)
2222 {
2223         struct rtmsg *rtm;
2224         struct nlattr *tb[RTA_MAX+1];
2225         int err;
2226
2227         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2228         if (err < 0)
2229                 goto errout;
2230
2231         err = -EINVAL;
2232         rtm = nlmsg_data(nlh);
2233         memset(cfg, 0, sizeof(*cfg));
2234
2235         cfg->fc_table = rtm->rtm_table;
2236         cfg->fc_dst_len = rtm->rtm_dst_len;
2237         cfg->fc_src_len = rtm->rtm_src_len;
2238         cfg->fc_flags = RTF_UP;
2239         cfg->fc_protocol = rtm->rtm_protocol;
2240
2241         if (rtm->rtm_type == RTN_UNREACHABLE)
2242                 cfg->fc_flags |= RTF_REJECT;
2243
2244         if (rtm->rtm_type == RTN_LOCAL)
2245                 cfg->fc_flags |= RTF_LOCAL;
2246
2247         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2248         cfg->fc_nlinfo.nlh = nlh;
2249         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2250
2251         if (tb[RTA_GATEWAY]) {
2252                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2253                 cfg->fc_flags |= RTF_GATEWAY;
2254         }
2255
2256         if (tb[RTA_DST]) {
2257                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2258
2259                 if (nla_len(tb[RTA_DST]) < plen)
2260                         goto errout;
2261
2262                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2263         }
2264
2265         if (tb[RTA_SRC]) {
2266                 int plen = (rtm->rtm_src_len + 7) >> 3;
2267
2268                 if (nla_len(tb[RTA_SRC]) < plen)
2269                         goto errout;
2270
2271                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2272         }
2273
2274         if (tb[RTA_PREFSRC])
2275                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2276
2277         if (tb[RTA_OIF])
2278                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2279
2280         if (tb[RTA_PRIORITY])
2281                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2282
2283         if (tb[RTA_METRICS]) {
2284                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2285                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2286         }
2287
2288         if (tb[RTA_TABLE])
2289                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2290
2291         err = 0;
2292 errout:
2293         return err;
2294 }
2295
2296 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2297 {
2298         struct fib6_config cfg;
2299         int err;
2300
2301         err = rtm_to_fib6_config(skb, nlh, &cfg);
2302         if (err < 0)
2303                 return err;
2304
2305         return ip6_route_del(&cfg);
2306 }
2307
2308 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2309 {
2310         struct fib6_config cfg;
2311         int err;
2312
2313         err = rtm_to_fib6_config(skb, nlh, &cfg);
2314         if (err < 0)
2315                 return err;
2316
2317         return ip6_route_add(&cfg);
2318 }
2319
2320 static inline size_t rt6_nlmsg_size(void)
2321 {
2322         return NLMSG_ALIGN(sizeof(struct rtmsg))
2323                + nla_total_size(16) /* RTA_SRC */
2324                + nla_total_size(16) /* RTA_DST */
2325                + nla_total_size(16) /* RTA_GATEWAY */
2326                + nla_total_size(16) /* RTA_PREFSRC */
2327                + nla_total_size(4) /* RTA_TABLE */
2328                + nla_total_size(4) /* RTA_IIF */
2329                + nla_total_size(4) /* RTA_OIF */
2330                + nla_total_size(4) /* RTA_PRIORITY */
2331                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2332                + nla_total_size(sizeof(struct rta_cacheinfo));
2333 }
2334
2335 static int rt6_fill_node(struct net *net,
2336                          struct sk_buff *skb, struct rt6_info *rt,
2337                          struct in6_addr *dst, struct in6_addr *src,
2338                          int iif, int type, u32 pid, u32 seq,
2339                          int prefix, int nowait, unsigned int flags)
2340 {
2341         const struct inet_peer *peer;
2342         struct rtmsg *rtm;
2343         struct nlmsghdr *nlh;
2344         long expires;
2345         u32 table;
2346         struct neighbour *n;
2347         u32 ts, tsage;
2348
2349         if (prefix) {   /* user wants prefix routes only */
2350                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2351                         /* success since this is not a prefix route */
2352                         return 1;
2353                 }
2354         }
2355
2356         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2357         if (!nlh)
2358                 return -EMSGSIZE;
2359
2360         rtm = nlmsg_data(nlh);
2361         rtm->rtm_family = AF_INET6;
2362         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2363         rtm->rtm_src_len = rt->rt6i_src.plen;
2364         rtm->rtm_tos = 0;
2365         if (rt->rt6i_table)
2366                 table = rt->rt6i_table->tb6_id;
2367         else
2368                 table = RT6_TABLE_UNSPEC;
2369         rtm->rtm_table = table;
2370         if (nla_put_u32(skb, RTA_TABLE, table))
2371                 goto nla_put_failure;
2372         if (rt->rt6i_flags & RTF_REJECT)
2373                 rtm->rtm_type = RTN_UNREACHABLE;
2374         else if (rt->rt6i_flags & RTF_LOCAL)
2375                 rtm->rtm_type = RTN_LOCAL;
2376         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2377                 rtm->rtm_type = RTN_LOCAL;
2378         else
2379                 rtm->rtm_type = RTN_UNICAST;
2380         rtm->rtm_flags = 0;
2381         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2382         rtm->rtm_protocol = rt->rt6i_protocol;
2383         if (rt->rt6i_flags & RTF_DYNAMIC)
2384                 rtm->rtm_protocol = RTPROT_REDIRECT;
2385         else if (rt->rt6i_flags & RTF_ADDRCONF)
2386                 rtm->rtm_protocol = RTPROT_KERNEL;
2387         else if (rt->rt6i_flags & RTF_DEFAULT)
2388                 rtm->rtm_protocol = RTPROT_RA;
2389
2390         if (rt->rt6i_flags & RTF_CACHE)
2391                 rtm->rtm_flags |= RTM_F_CLONED;
2392
2393         if (dst) {
2394                 if (nla_put(skb, RTA_DST, 16, dst))
2395                         goto nla_put_failure;
2396                 rtm->rtm_dst_len = 128;
2397         } else if (rtm->rtm_dst_len)
2398                 if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
2399                         goto nla_put_failure;
2400 #ifdef CONFIG_IPV6_SUBTREES
2401         if (src) {
2402                 if (nla_put(skb, RTA_SRC, 16, src))
2403                         goto nla_put_failure;
2404                 rtm->rtm_src_len = 128;
2405         } else if (rtm->rtm_src_len &&
2406                    nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
2407                 goto nla_put_failure;
2408 #endif
2409         if (iif) {
2410 #ifdef CONFIG_IPV6_MROUTE
2411                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2412                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2413                         if (err <= 0) {
2414                                 if (!nowait) {
2415                                         if (err == 0)
2416                                                 return 0;
2417                                         goto nla_put_failure;
2418                                 } else {
2419                                         if (err == -EMSGSIZE)
2420                                                 goto nla_put_failure;
2421                                 }
2422                         }
2423                 } else
2424 #endif
2425                         if (nla_put_u32(skb, RTA_IIF, iif))
2426                                 goto nla_put_failure;
2427         } else if (dst) {
2428                 struct in6_addr saddr_buf;
2429                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2430                     nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2431                         goto nla_put_failure;
2432         }
2433
2434         if (rt->rt6i_prefsrc.plen) {
2435                 struct in6_addr saddr_buf;
2436                 saddr_buf = rt->rt6i_prefsrc.addr;
2437                 if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2438                         goto nla_put_failure;
2439         }
2440
2441         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2442                 goto nla_put_failure;
2443
2444         rcu_read_lock();
2445         n = dst_get_neighbour_noref(&rt->dst);
2446         if (n) {
2447                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2448                         rcu_read_unlock();
2449                         goto nla_put_failure;
2450                 }
2451         }
2452         rcu_read_unlock();
2453
2454         if (rt->dst.dev &&
2455             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2456                 goto nla_put_failure;
2457         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2458                 goto nla_put_failure;
2459         if (!(rt->rt6i_flags & RTF_EXPIRES))
2460                 expires = 0;
2461         else if (rt->dst.expires - jiffies < INT_MAX)
2462                 expires = rt->dst.expires - jiffies;
2463         else
2464                 expires = INT_MAX;
2465
2466         peer = NULL;
2467         if (rt6_has_peer(rt))
2468                 peer = rt6_peer_ptr(rt);
2469         ts = tsage = 0;
2470         if (peer && peer->tcp_ts_stamp) {
2471                 ts = peer->tcp_ts;
2472                 tsage = get_seconds() - peer->tcp_ts_stamp;
2473         }
2474
2475         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2476                                expires, rt->dst.error) < 0)
2477                 goto nla_put_failure;
2478
2479         return nlmsg_end(skb, nlh);
2480
2481 nla_put_failure:
2482         nlmsg_cancel(skb, nlh);
2483         return -EMSGSIZE;
2484 }
2485
2486 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2487 {
2488         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2489         int prefix;
2490
2491         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2492                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2493                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2494         } else
2495                 prefix = 0;
2496
2497         return rt6_fill_node(arg->net,
2498                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2499                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2500                      prefix, 0, NLM_F_MULTI);
2501 }
2502
2503 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2504 {
2505         struct net *net = sock_net(in_skb->sk);
2506         struct nlattr *tb[RTA_MAX+1];
2507         struct rt6_info *rt;
2508         struct sk_buff *skb;
2509         struct rtmsg *rtm;
2510         struct flowi6 fl6;
2511         int err, iif = 0, oif = 0;
2512
2513         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2514         if (err < 0)
2515                 goto errout;
2516
2517         err = -EINVAL;
2518         memset(&fl6, 0, sizeof(fl6));
2519
2520         if (tb[RTA_SRC]) {
2521                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2522                         goto errout;
2523
2524                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2525         }
2526
2527         if (tb[RTA_DST]) {
2528                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2529                         goto errout;
2530
2531                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2532         }
2533
2534         if (tb[RTA_IIF])
2535                 iif = nla_get_u32(tb[RTA_IIF]);
2536
2537         if (tb[RTA_OIF])
2538                 oif = nla_get_u32(tb[RTA_OIF]);
2539
2540         if (iif) {
2541                 struct net_device *dev;
2542                 int flags = 0;
2543
2544                 dev = __dev_get_by_index(net, iif);
2545                 if (!dev) {
2546                         err = -ENODEV;
2547                         goto errout;
2548                 }
2549
2550                 fl6.flowi6_iif = iif;
2551
2552                 if (!ipv6_addr_any(&fl6.saddr))
2553                         flags |= RT6_LOOKUP_F_HAS_SADDR;
2554
2555                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2556                                                                flags);
2557         } else {
2558                 fl6.flowi6_oif = oif;
2559
2560                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2561         }
2562
2563         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2564         if (!skb) {
2565                 dst_release(&rt->dst);
2566                 err = -ENOBUFS;
2567                 goto errout;
2568         }
2569
2570         /* Reserve room for dummy headers, this skb can pass
2571            through good chunk of routing engine.
2572          */
2573         skb_reset_mac_header(skb);
2574         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2575
2576         skb_dst_set(skb, &rt->dst);
2577
2578         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2579                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2580                             nlh->nlmsg_seq, 0, 0, 0);
2581         if (err < 0) {
2582                 kfree_skb(skb);
2583                 goto errout;
2584         }
2585
2586         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2587 errout:
2588         return err;
2589 }
2590
2591 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2592 {
2593         struct sk_buff *skb;
2594         struct net *net = info->nl_net;
2595         u32 seq;
2596         int err;
2597
2598         err = -ENOBUFS;
2599         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2600
2601         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2602         if (!skb)
2603                 goto errout;
2604
2605         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2606                                 event, info->pid, seq, 0, 0, 0);
2607         if (err < 0) {
2608                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2609                 WARN_ON(err == -EMSGSIZE);
2610                 kfree_skb(skb);
2611                 goto errout;
2612         }
2613         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2614                     info->nlh, gfp_any());
2615         return;
2616 errout:
2617         if (err < 0)
2618                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2619 }
2620
2621 static int ip6_route_dev_notify(struct notifier_block *this,
2622                                 unsigned long event, void *data)
2623 {
2624         struct net_device *dev = (struct net_device *)data;
2625         struct net *net = dev_net(dev);
2626
2627         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2628                 net->ipv6.ip6_null_entry->dst.dev = dev;
2629                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2630 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2631                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2632                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2633                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2634                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2635 #endif
2636         }
2637
2638         return NOTIFY_OK;
2639 }
2640
2641 /*
2642  *      /proc
2643  */
2644
2645 #ifdef CONFIG_PROC_FS
2646
/*
 * Bookkeeping for a buffer-based /proc reader.
 * NOTE(review): nothing in the visible part of this file uses this
 * structure (the /proc handlers below go through seq_file) - confirm
 * whether it is still needed.
 */
struct rt6_proc_arg {
	char *buffer;	/* output buffer */
	int offset;
	int length;
	int skip;
	int len;
};
2655
2656 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2657 {
2658         struct seq_file *m = p_arg;
2659         struct neighbour *n;
2660
2661         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2662
2663 #ifdef CONFIG_IPV6_SUBTREES
2664         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2665 #else
2666         seq_puts(m, "00000000000000000000000000000000 00 ");
2667 #endif
2668         rcu_read_lock();
2669         n = dst_get_neighbour_noref(&rt->dst);
2670         if (n) {
2671                 seq_printf(m, "%pi6", n->primary_key);
2672         } else {
2673                 seq_puts(m, "00000000000000000000000000000000");
2674         }
2675         rcu_read_unlock();
2676         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2677                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2678                    rt->dst.__use, rt->rt6i_flags,
2679                    rt->dst.dev ? rt->dst.dev->name : "");
2680         return 0;
2681 }
2682
2683 static int ipv6_route_show(struct seq_file *m, void *v)
2684 {
2685         struct net *net = (struct net *)m->private;
2686         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2687         return 0;
2688 }
2689
2690 static int ipv6_route_open(struct inode *inode, struct file *file)
2691 {
2692         return single_open_net(inode, file, ipv6_route_show);
2693 }
2694
2695 static const struct file_operations ipv6_route_proc_fops = {
2696         .owner          = THIS_MODULE,
2697         .open           = ipv6_route_open,
2698         .read           = seq_read,
2699         .llseek         = seq_lseek,
2700         .release        = single_release_net,
2701 };
2702
2703 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2704 {
2705         struct net *net = (struct net *)seq->private;
2706         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2707                    net->ipv6.rt6_stats->fib_nodes,
2708                    net->ipv6.rt6_stats->fib_route_nodes,
2709                    net->ipv6.rt6_stats->fib_rt_alloc,
2710                    net->ipv6.rt6_stats->fib_rt_entries,
2711                    net->ipv6.rt6_stats->fib_rt_cache,
2712                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2713                    net->ipv6.rt6_stats->fib_discarded_routes);
2714
2715         return 0;
2716 }
2717
2718 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2719 {
2720         return single_open_net(inode, file, rt6_stats_seq_show);
2721 }
2722
2723 static const struct file_operations rt6_stats_seq_fops = {
2724         .owner   = THIS_MODULE,
2725         .open    = rt6_stats_seq_open,
2726         .read    = seq_read,
2727         .llseek  = seq_lseek,
2728         .release = single_release_net,
2729 };
2730 #endif  /* CONFIG_PROC_FS */
2731
2732 #ifdef CONFIG_SYSCTL
2733
2734 static
2735 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2736                               void __user *buffer, size_t *lenp, loff_t *ppos)
2737 {
2738         struct net *net;
2739         int delay;
2740         if (!write)
2741                 return -EINVAL;
2742
2743         net = (struct net *)ctl->extra1;
2744         delay = net->ipv6.sysctl.flush_delay;
2745         proc_dointvec(ctl, write, buffer, lenp, ppos);
2746         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2747         return 0;
2748 }
2749
2750 ctl_table ipv6_route_table_template[] = {
2751         {
2752                 .procname       =       "flush",
2753                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2754                 .maxlen         =       sizeof(int),
2755                 .mode           =       0200,
2756                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2757         },
2758         {
2759                 .procname       =       "gc_thresh",
2760                 .data           =       &ip6_dst_ops_template.gc_thresh,
2761                 .maxlen         =       sizeof(int),
2762                 .mode           =       0644,
2763                 .proc_handler   =       proc_dointvec,
2764         },
2765         {
2766                 .procname       =       "max_size",
2767                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2768                 .maxlen         =       sizeof(int),
2769                 .mode           =       0644,
2770                 .proc_handler   =       proc_dointvec,
2771         },
2772         {
2773                 .procname       =       "gc_min_interval",
2774                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2775                 .maxlen         =       sizeof(int),
2776                 .mode           =       0644,
2777                 .proc_handler   =       proc_dointvec_jiffies,
2778         },
2779         {
2780                 .procname       =       "gc_timeout",
2781                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2782                 .maxlen         =       sizeof(int),
2783                 .mode           =       0644,
2784                 .proc_handler   =       proc_dointvec_jiffies,
2785         },
2786         {
2787                 .procname       =       "gc_interval",
2788                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2789                 .maxlen         =       sizeof(int),
2790                 .mode           =       0644,
2791                 .proc_handler   =       proc_dointvec_jiffies,
2792         },
2793         {
2794                 .procname       =       "gc_elasticity",
2795                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2796                 .maxlen         =       sizeof(int),
2797                 .mode           =       0644,
2798                 .proc_handler   =       proc_dointvec,
2799         },
2800         {
2801                 .procname       =       "mtu_expires",
2802                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2803                 .maxlen         =       sizeof(int),
2804                 .mode           =       0644,
2805                 .proc_handler   =       proc_dointvec_jiffies,
2806         },
2807         {
2808                 .procname       =       "min_adv_mss",
2809                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2810                 .maxlen         =       sizeof(int),
2811                 .mode           =       0644,
2812                 .proc_handler   =       proc_dointvec,
2813         },
2814         {
2815                 .procname       =       "gc_min_interval_ms",
2816                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2817                 .maxlen         =       sizeof(int),
2818                 .mode           =       0644,
2819                 .proc_handler   =       proc_dointvec_ms_jiffies,
2820         },
2821         { }
2822 };
2823
2824 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2825 {
2826         struct ctl_table *table;
2827
2828         table = kmemdup(ipv6_route_table_template,
2829                         sizeof(ipv6_route_table_template),
2830                         GFP_KERNEL);
2831
2832         if (table) {
2833                 table[0].data = &net->ipv6.sysctl.flush_delay;
2834                 table[0].extra1 = net;
2835                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2836                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2837                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2838                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2839                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2840                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2841                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2842                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2843                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2844         }
2845
2846         return table;
2847 }
2848 #endif
2849
2850 static int __net_init ip6_route_net_init(struct net *net)
2851 {
2852         int ret = -ENOMEM;
2853
2854         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2855                sizeof(net->ipv6.ip6_dst_ops));
2856
2857         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2858                 goto out_ip6_dst_ops;
2859
2860         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2861                                            sizeof(*net->ipv6.ip6_null_entry),
2862                                            GFP_KERNEL);
2863         if (!net->ipv6.ip6_null_entry)
2864                 goto out_ip6_dst_entries;
2865         net->ipv6.ip6_null_entry->dst.path =
2866                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2867         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2868         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2869                          ip6_template_metrics, true);
2870
2871 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2872         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2873                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2874                                                GFP_KERNEL);
2875         if (!net->ipv6.ip6_prohibit_entry)
2876                 goto out_ip6_null_entry;
2877         net->ipv6.ip6_prohibit_entry->dst.path =
2878                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2879         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2880         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2881                          ip6_template_metrics, true);
2882
2883         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2884                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2885                                                GFP_KERNEL);
2886         if (!net->ipv6.ip6_blk_hole_entry)
2887                 goto out_ip6_prohibit_entry;
2888         net->ipv6.ip6_blk_hole_entry->dst.path =
2889                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2890         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2891         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2892                          ip6_template_metrics, true);
2893 #endif
2894
2895         net->ipv6.sysctl.flush_delay = 0;
2896         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2897         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2898         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2899         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2900         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2901         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2902         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2903
2904         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2905
2906         ret = 0;
2907 out:
2908         return ret;
2909
2910 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2911 out_ip6_prohibit_entry:
2912         kfree(net->ipv6.ip6_prohibit_entry);
2913 out_ip6_null_entry:
2914         kfree(net->ipv6.ip6_null_entry);
2915 #endif
2916 out_ip6_dst_entries:
2917         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2918 out_ip6_dst_ops:
2919         goto out;
2920 }
2921
2922 static void __net_exit ip6_route_net_exit(struct net *net)
2923 {
2924         kfree(net->ipv6.ip6_null_entry);
2925 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2926         kfree(net->ipv6.ip6_prohibit_entry);
2927         kfree(net->ipv6.ip6_blk_hole_entry);
2928 #endif
2929         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2930 }
2931
2932 static int __net_init ip6_route_net_init_late(struct net *net)
2933 {
2934 #ifdef CONFIG_PROC_FS
2935         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2936         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2937 #endif
2938         return 0;
2939 }
2940
2941 static void __net_exit ip6_route_net_exit_late(struct net *net)
2942 {
2943 #ifdef CONFIG_PROC_FS
2944         proc_net_remove(net, "ipv6_route");
2945         proc_net_remove(net, "rt6_stats");
2946 #endif
2947 }
2948
2949 static struct pernet_operations ip6_route_net_ops = {
2950         .init = ip6_route_net_init,
2951         .exit = ip6_route_net_exit,
2952 };
2953
2954 static int __net_init ipv6_inetpeer_init(struct net *net)
2955 {
2956         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2957
2958         if (!bp)
2959                 return -ENOMEM;
2960         inet_peer_base_init(bp);
2961         net->ipv6.peers = bp;
2962         return 0;
2963 }
2964
2965 static void __net_exit ipv6_inetpeer_exit(struct net *net)
2966 {
2967         struct inet_peer_base *bp = net->ipv6.peers;
2968
2969         net->ipv6.peers = NULL;
2970         inetpeer_invalidate_tree(bp);
2971         kfree(bp);
2972 }
2973
2974 static struct pernet_operations ipv6_inetpeer_ops = {
2975         .init   =       ipv6_inetpeer_init,
2976         .exit   =       ipv6_inetpeer_exit,
2977 };
2978
2979 static struct pernet_operations ip6_route_net_late_ops = {
2980         .init = ip6_route_net_init_late,
2981         .exit = ip6_route_net_exit_late,
2982 };
2983
2984 static struct notifier_block ip6_route_dev_notifier = {
2985         .notifier_call = ip6_route_dev_notify,
2986         .priority = 0,
2987 };
2988
2989 int __init ip6_route_init(void)
2990 {
2991         int ret;
2992
2993         ret = -ENOMEM;
2994         ip6_dst_ops_template.kmem_cachep =
2995                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2996                                   SLAB_HWCACHE_ALIGN, NULL);
2997         if (!ip6_dst_ops_template.kmem_cachep)
2998                 goto out;
2999
3000         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3001         if (ret)
3002                 goto out_kmem_cache;
3003
3004         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3005         if (ret)
3006                 goto out_dst_entries;
3007
3008         ret = register_pernet_subsys(&ip6_route_net_ops);
3009         if (ret)
3010                 goto out_register_inetpeer;
3011
3012         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3013
3014         /* Registering of the loopback is done before this portion of code,
3015          * the loopback reference in rt6_info will not be taken, do it
3016          * manually for init_net */
3017         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3018         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3019   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3020         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3021         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3022         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3023         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3024   #endif
3025         ret = fib6_init();
3026         if (ret)
3027                 goto out_register_subsys;
3028
3029         ret = xfrm6_init();
3030         if (ret)
3031                 goto out_fib6_init;
3032
3033         ret = fib6_rules_init();
3034         if (ret)
3035                 goto xfrm6_init;
3036
3037         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3038         if (ret)
3039                 goto fib6_rules_init;
3040
3041         ret = -ENOBUFS;
3042         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3043             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3044             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3045                 goto out_register_late_subsys;
3046
3047         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3048         if (ret)
3049                 goto out_register_late_subsys;
3050
3051 out:
3052         return ret;
3053
3054 out_register_late_subsys:
3055         unregister_pernet_subsys(&ip6_route_net_late_ops);
3056 fib6_rules_init:
3057         fib6_rules_cleanup();
3058 xfrm6_init:
3059         xfrm6_fini();
3060 out_fib6_init:
3061         fib6_gc_cleanup();
3062 out_register_subsys:
3063         unregister_pernet_subsys(&ip6_route_net_ops);
3064 out_register_inetpeer:
3065         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3066 out_dst_entries:
3067         dst_entries_destroy(&ip6_dst_blackhole_ops);
3068 out_kmem_cache:
3069         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3070         goto out;
3071 }
3072
3073 void ip6_route_cleanup(void)
3074 {
3075         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3076         unregister_pernet_subsys(&ip6_route_net_late_ops);
3077         fib6_rules_cleanup();
3078         xfrm6_fini();
3079         fib6_gc_cleanup();
3080         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3081         unregister_pernet_subsys(&ip6_route_net_ops);
3082         dst_entries_destroy(&ip6_dst_blackhole_ops);
3083         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3084 }