 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 *	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *	Alan Cox, <gw4pts@gw4pts.ampr.org>
 *	Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *	Alan Cox	:	Verify area fixes.
 *	Alan Cox	:	cli() protects routing changes
 *	Rui Oliveira	:	ICMP routing table updates
 *	(rco@di.uminho.pt)	Routing table insertion and update
 *	Linus Torvalds	:	Rewrote bits to be sensible
 *	Alan Cox	:	Added BSD route gw semantics
 *	Alan Cox	:	Super /proc >4K
 *	Alan Cox	:	MTU in route table
 *	Alan Cox	:	MSS actually. Also added the window
 *	Sam Lantinga	:	Fixed route matching in rt_del()
 *	Alan Cox	:	Routing cache support.
 *	Alan Cox	:	Removed compatibility cruft.
 *	Alan Cox	:	RTF_REJECT support.
 *	Alan Cox	:	TCP irtt support.
 *	Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *	Alan Cox	:	Use __u32 properly
 *	Alan Cox	:	Aligned routing errors more closely with BSD;
 *				our system is still very different.
 *	Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *				routing caches and better behaviour.
 *	Olaf Erb	:	irtt wasn't being copied right.
 *	Bjorn Ekwall	:	Kerneld route support.
 *	Alan Cox	:	Multicast fixed (I hope)
 *	Pavel Krauz	:	Limited broadcast fixed
 *	Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *				route.c and rewritten from scratch.
 *	Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *	Marc Boucher	:	routing by fwmark
 *	Robert Olsson	:	Added rt_cache statistics
 *	Arnaldo C. Melo	:	Convert proc stuff to seq_file
 *	Eric Dumazet	:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov	:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov	:	Removed TOS from hash calculations
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
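/* RT_FL_TOS() keeps only the TOS bits routing actually keys on
 * (IPTOS_RT_MASK) plus RTO_ONLINK, a flag callers smuggle in through the
 * tos byte to request link-scope resolution; everything else, including
 * the ECN bits, is masked off before a lookup. */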
#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
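/* These tunables are exposed through sysctl as net.ipv4.route.*.  The
 * redirect triple drives the exponential backoff in ip_rt_send_redirect();
 * error_cost/error_burst form the token bucket for ICMP errors in
 * ip_error(); min_pmtu (512 bytes of payload + 20-byte IP header + 20-byte
 * TCP header) is the floor applied to learned path MTUs; mtu_expires is
 * how long a learned PMTU exception stays valid. */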
 *	Interface to generic destination cache.

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		 ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.check			= ipv4_dst_check,
	.default_advmss		= ipv4_default_advmss,
	.cow_metrics		= ipv4_cow_metrics,
	.destroy		= ipv4_dst_destroy,
	.negative_advice	= ipv4_negative_advice,
	.link_failure		= ipv4_link_failure,
	.update_pmtu		= ip_rt_update_pmtu,
	.redirect		= ip_do_redirect,
	.local_out		= __ip_local_out,
	.neigh_lookup		= ipv4_neigh_lookup,
	.confirm_neigh		= ipv4_confirm_neigh,

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	ECN_OR_COST(BESTEFFORT),
	ECN_OR_COST(BESTEFFORT),
	ECN_OR_COST(INTERACTIVE),
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
EXPORT_SYMBOL(ip_tos2prio);
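/* ip_tos2prio maps the TOS byte (shifted right by one, since bit 0 carries
 * ECN state) onto an skb priority band; the usual accessor is
 * rt_tos2priority() in <net/route.h>, roughly:
 *
 *	prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 */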
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
	return SEQ_START_TOKEN;

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)

static void rt_cache_seq_stop(struct seq_file *seq, void *v)

static int rt_cache_seq_show(struct seq_file *seq, void *v)
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,

static int rt_cache_seq_open(struct inode *inode, struct file *file)
	return seq_open(file, &rt_cache_seq_ops);

static const struct file_operations rt_cache_seq_fops = {
	.open	 = rt_cache_seq_open,
	.release = seq_release,

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
		return &per_cpu(rt_cache_stat, cpu);

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
		return &per_cpu(rt_cache_stat, cpu);

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
	return seq_open(file, &rt_cpu_seq_ops);

static const struct file_operations rt_cpu_seq_fops = {
	.open	 = rt_cpu_seq_open,
	.release = seq_release,

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
	struct ip_rt_acct *dst, *src;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));

static int __net_init ip_rt_do_proc_init(struct net *net)
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", 0444, net->proc_net,

	pde = proc_create("rt_cache", 0444,
			  net->proc_net_stat, &rt_cpu_seq_fops);

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create_single("rt_acct", 0, net->proc_net,

#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);

static void __net_exit ip_rt_do_proc_exit(struct net *net)
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,

static int __init ip_rt_proc_init(void)
	return register_pernet_subsys(&ip_rt_proc_ops);

static inline int ip_rt_proc_init(void)
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));

void rt_cache_flush(struct net *net)
	rt_genid_bump_ipv4(net);

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;

	if (likely(rt->rt_gw_family == AF_INET)) {
		n = ip_neigh_gw4(dev, rt->rt_gw4);
	} else if (rt->rt_gw_family == AF_INET6) {
		n = ip_neigh_gw6(dev, &rt->rt_gw6);
		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
		n = ip_neigh_gw4(dev, pkey);

	if (n && !refcount_inc_not_zero(&n->refcnt))

	rcu_read_unlock_bh();

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;

	if (rt->rt_gw_family == AF_INET) {
		pkey = (const __be32 *)&rt->rt_gw4;
	} else if (rt->rt_gw_family == AF_INET6) {
		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
		     (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {

	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;
/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
u32 ip_idents_reserve(u32 hash, int segs)
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = READ_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
		old = (u32)atomic_read(p_id);
		new = old + delta + segs;
	} while (atomic_cmpxchg(p_id, old, new) != old);

EXPORT_SYMBOL(ip_idents_reserve);
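/* The generator is lock-free: the cmpxchg loop adds `segs` to the bucket's
 * counter, and `delta` injects a random jump whenever the bucket sat idle
 * for at least one jiffy, so an observer cannot count packets from the ID
 * sequence.  The caller effectively reserves `segs` consecutive IDs, e.g.
 * a 3-segment GSO burst gets new-3, new-2 and new-1. */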
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)

	/* Note the following code is not safe, but this is okay. */
	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
		get_random_bytes(&net->ipv4.ip_id_key,
				 sizeof(net->ipv4.ip_id_key));

	hash = siphash_3u32((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    &net->ipv4.ip_id_key);
	id = ip_idents_reserve(hash, segs);
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     u8 prot, u32 mark, int flow_flags)
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
		build_skb_flow_key(fl4, skb, sk);
		build_sk_flow_key(fl4, sk);

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
	rt = rcu_dereference(fnhe->fnhe_rth_input);
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	rt = rcu_dereference(fnhe->fnhe_rth_output);
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
	fnhe_flush_routes(oldest);

static inline u32 fnhe_hashfun(__be32 daddr)
	static u32 fnhe_hashrnd __read_mostly;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
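/* Exceptions (learned redirects and path MTUs) live in a small per-nexthop
 * hash table keyed only by destination address: jhash_1word() mixes daddr
 * with a boot-time random seed and hash_32() folds the result down to
 * FNHE_HASH_SHIFT bits, so the bucket count stays fixed and tiny. */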
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
	rt->dst.expires = fnhe->fnhe_expires;

		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gw_family = AF_INET;
		rt->rt_gw4 = fnhe->fnhe_gw;

static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
				  __be32 gw, u32 pmtu, bool lock,
				  unsigned long expires)
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;

	genid = fnhe_genid(dev_net(nhc->nhc_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nhc->nhc_exceptions);
		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
		rcu_assign_pointer(nhc->nhc_exceptions, hash);

	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)

		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
			fill_route_from_fnhe(rt, fnhe);
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);

		fnhe->fnhe_next = hash->chain;
		rcu_assign_pointer(hash->chain, fnhe);
		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		rt = rcu_dereference(nhc->nhc_rth_input);
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
				rt->dst.obsolete = DST_OBSOLETE_KILL;

	fnhe->fnhe_stamp = jiffies;

	spin_unlock_bh(&fnhe_lock);
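/* update_or_create_fnhe() is the single writer for the exception table
 * (fnhe_lock serializes it); readers walk the chains under RCU.  Note the
 * deliberate asymmetry: updating an existing entry refreshes it in place,
 * while creating one also marks the nexthop's cached routes
 * DST_OBSOLETE_KILL so every user revalidates via ipv4_dst_check(). */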
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:

	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)

	in_dev = __in_dev_get_rcu(dev);

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh_common *nhc = FIB_RES_NHC(res);

				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
						      jiffies + ip_rt_gc_timeout);
			rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);

#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

		if (dst->obsolete > 0) {
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||

 * 1. The first ip_rt_redirect_number redirects are sent
 *    with exponential backoff, then we stop sending them at all,
 *    assuming that the host ignores our redirects.
 * 2. If we did not see packets requiring redirects
 *    during ip_rt_redirect_silence, we assume that the host
 *    forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
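 *
 * In inet_peer terms: rate_tokens counts redirects already sent, so the
 * next one is permitted only after rate_last + (ip_rt_redirect_load <<
 * rate_tokens); once n_redirects reaches ip_rt_redirect_number redirects
 * stop entirely, until an ip_rt_redirect_silence quiet period resets both
 * counters.
 */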
void ip_rt_send_redirect(struct sk_buff *skb)
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;

	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {

	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
		peer->rate_tokens = 0;
		peer->n_redirects = 0;

	/* Too many ignored redirects; do not send anything;
	 * set peer->rate_last to the last seen redirected packet.
	if (peer->n_redirects >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;

	/* Check for load limit; set rate_last to the latest sent
	if (peer->rate_tokens == 0 ||
		      (ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);

static int ip_error(struct sk_buff *skb)
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;

	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);

	switch (rt->dst.error) {
		code = ICMP_HOST_UNREACH;
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		code = ICMP_PKT_FILTERED;

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
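/* Classic token bucket: tokens accrue one per jiffy up to ip_rt_error_burst
 * (5 * HZ) and each ICMP error costs ip_rt_error_cost (HZ), i.e. at most a
 * burst of five errors and then one per second per source.  In sketch form:
 *
 *	rate_tokens += now - rate_last;		// refill since last packet
 *	if (rate_tokens >= ip_rt_error_cost) {	// can we pay for one error?
 *		rate_tokens -= ip_rt_error_cost;
 *		// (the elided branch sets the flag that allows icmp_send())
 *	}
 */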
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
	struct dst_entry *dst = &rt->dst;
	u32 old_mtu = ipv4_mtu(dst);
	struct fib_result res;

	if (ip_mtu_locked(dst))

	if (mtu < ip_rt_min_pmtu) {
		mtu = min(old_mtu, ip_rt_min_pmtu);

	if (rt->rt_pmtu == mtu && !lock &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))

	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh_common *nhc = FIB_RES_NHC(res);

		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
	struct rtable *rt = (struct rtable *) dst;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u8 protocol)
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	u32 mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, 0);
	rt = __ip_route_output_key(net, &fl4);
		__ip_rt_update_pmtu(rt, &fl4, mtu);
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
	const struct iphdr *iph = (const struct iphdr *) skb->data;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
		__ip_rt_update_pmtu(rt, &fl4, mtu);

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct dst_entry *odst = NULL;
	struct net *net = sock_net(sk);

	if (!ip_sk_accept_pmtu(sk))

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);

	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);

		sk_dst_set(sk, &rt->dst);
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u8 protocol)
	const struct iphdr *iph = (const struct iphdr *) skb->data;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
		__ip_do_redirect(rt, skb, &fl4, false);
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
		__ip_do_redirect(rt, skb, &fl4, false);
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD.
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))

static void ipv4_send_dest_unreach(struct sk_buff *skb)
	struct ip_options opt;

	/* Recompile ip options since IPCB may not be valid anymore.
	 * Also check we have a reasonable ipv4 header.
	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)

	memset(&opt, 0, sizeof(opt));
	if (ip_hdr(skb)->ihl > 5) {
		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);

	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);

static void ipv4_link_failure(struct sk_buff *skb)
	ipv4_send_dest_unreach(skb);

	rt = skb_rtable(skb);
		dst_set_expires(&rt->dst, 0);

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");

   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned.

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
		struct fib_result res;
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {
			.daddr = iph->daddr,
			.saddr = iph->saddr,
			.flowi4_tos = RT_TOS(iph->tos),
			.flowi4_oif = rt->dst.dev->ifindex,
			.flowi4_iif = skb->dev->ifindex,
			.flowi4_mark = skb->mark,

		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
	memcpy(addr, &src, 4);

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,

	return min(advmss, IPV4_MAX_PMTU - header_size);

static unsigned int ipv4_mtu(const struct dst_entry *dst)
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		if (rt->rt_gw_family && mtu > 576)

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
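/* MTU resolution order for a cached route: an unexpired learned rt_pmtu
 * wins, then the RTAX_MTU metric, then the device MTU; locked routes via a
 * gateway are clamped down to the historic 576-byte default, and lwtunnel
 * encapsulation headroom is subtracted last. */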
static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nhc->nhc_exceptions,
					 lockdep_is_held(&fnhe_lock));

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			/* set fnhe_daddr to 0 to ensure it won't bind with
			 * new dsts in rt_bind_exception().
			fnhe->fnhe_daddr = 0;
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));

	spin_unlock_bh(&fnhe_lock);

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
	struct fib_nh_exception *fnhe;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr) {
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires)) {
				ip_del_fnhe(nhc, daddr);

 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
	struct fib_nh_common *nhc = res->nhc;
	struct net_device *dev = nhc->nhc_dev;
	struct fib_info *fi = res->fi;

	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))

		struct fib_nh_exception *fnhe;

		fnhe = find_exception(nhc, daddr);
		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
			mtu = fnhe->fnhe_pmtu;

		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe->fnhe_mtu_locked = false;
			fnhe_flush_routes(fnhe);
		fill_route_from_fnhe(rt, fnhe);
			rt->rt_gw_family = AF_INET;

			rcu_assign_pointer(*porig, rt);
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);

		fnhe->fnhe_stamp = jiffies;

	spin_unlock_bh(&fnhe_lock);
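/* rt_bind_exception() attaches a freshly built route to its matching
 * exception entry under fnhe_lock, so later lookups for the same daddr can
 * reuse it directly (fnhe_rth_input/fnhe_rth_output).  A genid mismatch
 * means the netns exception generation was bumped, so the stale entry is
 * reset before the new route is bound. */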
static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
	struct rtable *orig, *prev, **p;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nhc->nhc_rth_input;
		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);

	/* hold dst before doing cmpxchg() to avoid race condition
	prev = cmpxchg(p, orig, rt);
			dst_dev_put(&orig->dst);
			dst_release(&orig->dst);
		dst_release(&rt->dst);

struct uncached_list {
	struct list_head	head;

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

void rt_add_uncached_list(struct rtable *rt)
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);

void rt_del_uncached_list(struct rtable *rt)
	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);

static void ipv4_dst_destroy(struct dst_entry *dst)
	struct rtable *rt = (struct rtable *)dst;

	ip_dst_metrics_put(dst);
	rt_del_uncached_list(rt);

void rt_flush_dev(struct net_device *dev)
	struct net *net = dev_net(dev);

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
		spin_unlock_bh(&ul->lock);

static bool rt_cache_valid(const struct rtable *rt)
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
	bool cached = false;

		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
			rt->rt_gw_family = nhc->nhc_gw_family;
			/* only INET and INET6 are supported */
			if (likely(nhc->nhc_gw_family == AF_INET))
				rt->rt_gw4 = nhc->nhc_gw.ipv4;
				rt->rt_gw6 = nhc->nhc_gw.ipv6;

		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);

#ifdef CONFIG_IP_ROUTE_CLASSID
			nh = container_of(nhc, struct fib_nh, nh_common);
			rt->dst.tclassid = nh->nh_tclassid;
		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
			cached = rt_cache_route(nhc, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
				rt->rt_gw_family = AF_INET;

			rt_add_uncached_list(rt);
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
	set_class_tag(rt, itag);

struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm, bool will_cache)
	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (will_cache ? 0 : DST_HOST) |
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_is_input = 0;
		rt->rt_mtu_locked = 0;
		rt->rt_gw_family = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
EXPORT_SYMBOL(rt_dst_alloc);

/* called in rcu_read_lock() section */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
	/* Primary sanity checks. */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr) &&
		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);

		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);

static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		 *	Per the RFC1812 recommendation: if the source is
		 *	martian, the only hint is the MAC header.
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, false);

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
	struct net_device *dev = nhc->nhc_dev;
	struct fib_nh_exception *fnhe;
	struct in_device *out_dev;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(dev);
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP)) {
		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
		if (IN_DEV_SHARED_MEDIA(out_dev) ||
		    inet_addr_onlink(out_dev, saddr, gw))
			IPCB(skb)->flags |= IPSKB_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {

	fnhe = find_exception(nhc, daddr);
			rth = rcu_dereference(fnhe->fnhe_rth_input);
			rth = rcu_dereference(nhc->nhc_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);

	rth->rt_is_input = 1;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
	lwtunnel_set_redirect(&rth->dst);
	skb_dst_set(skb, &rth->dst);

#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
	const struct iphdr *outer_iph = ip_hdr(skb);
	const struct iphdr *key_iph = outer_iph;
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	if (likely(outer_iph->protocol != IPPROTO_ICMP))

	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),

	if (icmph->type != ICMP_DEST_UNREACH &&
	    icmph->type != ICMP_REDIRECT &&
	    icmph->type != ICMP_TIME_EXCEEDED &&
	    icmph->type != ICMP_PARAMETERPROB)

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);

	key_iph = inner_iph;
	hash_keys->addrs.v4addrs.src = key_iph->saddr;
	hash_keys->addrs.v4addrs.dst = key_iph->daddr;

/* if skb is set it will be used and fl4 can be NULL */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
	struct flow_keys hash_keys;

	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			ip_multipath_l3_keys(skb, &hash_keys);
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		/* skb is currently provided only when forwarding */
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			skb_flow_dissect_flow_keys(skb, &keys, flag);

			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
	mhash = flow_hash_from_keys(&hash_keys);

		mhash = jhash_2words(mhash, multipath_hash, 0);
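/* Hash policy 0 uses L3 addresses only (taking them from the inner header
 * for ICMP errors, per ip_multipath_l3_keys() above); policy 1 adds the L4
 * ports and protocol.  Any caller-provided multipath hash in the flow is
 * folded in with jhash_2words() so custom hashes still spread the traffic
 * across nexthops. */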
#endif /* CONFIG_IP_ROUTE_MULTIPATH */

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos,
			    struct flow_keys *hkeys)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && fib_info_num_path(res->fi) > 1) {
		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);

		fib_select_multipath(res, h);

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);

 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	called with rcu_read_lock()

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev,
			       struct fib_result *res)
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flow_keys *flkeys = NULL, _flkeys;
	struct net *net = dev_net(dev);
	struct ip_tunnel_info *tun_info;
	unsigned int flags = 0;

	/* IP on this device is disabled. */

	/* Check for the most weird martians, which cannot be detected

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
		fl4.flowi4_tun_key.tun_id = 0;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))

	/* Accept zero addresses only to limited broadcast;
	 * I am not even sure whether to fix this or not. Waiting for complaints :-)
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
	 * more than once, calling it only if daddr and/or saddr are loopback addresses.
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;

	 *	Now we are ready to route the packet.
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.flowi4_uid = sock_net_uid(net, NULL);

	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
		fl4.flowi4_proto = 0;

	err = fib_lookup(net, &fl4, res, 0);
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;

	if (res->type == RTN_BROADCAST) {
		if (IN_DEV_BFORWARD(in_dev))

	if (res->type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
			goto martian_source;

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
	if (res->type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);

	if (skb->protocol != htons(ETH_P_IP))

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
			goto martian_source;
	flags |= RTCF_BROADCAST;
	res->type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		rth = rcu_dereference(nhc->nhc_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);

	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
			   flags | RTCF_LOCAL, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);

	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
	rth->rt_is_input = 1;

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res->type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;

		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
			WARN_ON(rth->dst.input == lwtunnel_input);
			rth->dst.lwtstate->orig_input = rth->dst.input;
			rth->dst.input = lwtunnel_input;

		if (unlikely(!rt_cache_route(nhc, rth)))
			rt_add_uncached_list(rth);
	skb_dst_set(skb, &rth->dst);

	RT_CACHE_STAT_INC(in_no_route);
	res->type = RTN_UNREACHABLE;

	 *	Do not cache martian addresses: they should be logged (RFC1812)
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);

	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);

int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
	struct fib_result res;

	tos &= IPTOS_RT_MASK;
	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);

EXPORT_SYMBOL(ip_route_input_noref);

/* called with rcu_read_lock held */
int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		       u8 tos, struct net_device *dev, struct fib_result *res)
	/* Multicast recognition logic is moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because
	   a route cache entry is created eventually.
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

			our = ip_check_mc_rcu(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);

		/* check l3 master if no match yet */
		if (!our && netif_is_l3_slave(dev)) {
			struct in_device *l3_in_dev;

			l3_in_dev = __in_dev_get_rcu(skb->dev);
				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
						      ip_hdr(skb)->protocol);

#ifdef CONFIG_IP_MROUTE
		    (!ipv4_is_local_multicast(daddr) &&
		     IN_DEV_MFORWARD(in_dev))
			err = ip_route_input_mc(skb, daddr, saddr,

	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;

	in_dev = __in_dev_get_rcu(dev_out);
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) &&
		    !(dev_out->flags & IFF_LOOPBACK) &&
		    !netif_is_l3_master(dev_out))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
			flags &= ~RTCF_LOCAL;

		/* If a multicast route does not exist, use
		 * the default one, but do not gateway in this case.
		if (fi && res->prefixlen < 4)
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet it won't be received, because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.

	do_cache &= fi != NULL;
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
		struct rtable __rcu **prth;

		fnhe = find_exception(nhc, fl4->daddr);
			prth = &fnhe->fnhe_rth_output;
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nhc->nhc_gw_family &&
				       nhc->nhc_scope == RT_SCOPE_LINK))) {
			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))

	rth = rt_dst_alloc(dev_out, flags, type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
		return ERR_PTR(-ENOBUFS);

	rth->rt_iif = orig_oif;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
	lwtunnel_set_redirect(&rth->dst);

 * Major route resolver routine.

struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
					const struct sk_buff *skb)
	__u8 tos = RT_FL_TOS(fl4);
	struct fib_result res = {

	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			     RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
					    struct fib_result *res,
					    const struct sk_buff *skb)
{
	struct net_device *dev_out = NULL;
	int orig_oif = fl4->flowi4_oif;
	unsigned int flags = 0;
	struct rtable *rth;
	int err = -ENETUNREACH;

	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed a check for oif == dev_out->oif here.
		 * It was wrong for two reasons:
		 * 1. ip_dev_find(net, saddr) can return the wrong iface if
		 *    saddr is assigned to multiple interfaces.
		 * 2. Moreover, we are allowed to send packets with the saddr
		 *    of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (!dev_out)
				goto out;

			/* Special hack: the user can direct multicasts and
			 * limited broadcast via the necessary interface
			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			 * This hack is not just for fun: it allows
			 * vic, vat and friends to work.
			 * They bind a socket to loopback, set ttl to zero
			 * and expect that it will work.
			 * From the viewpoint of the routing cache they are
			 * broken, because we are not allowed to build a
			 * multicast path with a loopback source addr (the
			 * routing cache cannot know that ttl is zero, hence
			 * that the packet will not leave this host and the
			 * route is valid).
			 * Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}

	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (!dev_out)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr) ||
		    fl4->flowi4_proto == IPPROTO_IGMP) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res->type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	err = fib_lookup(net, fl4, res, 0);
	if (err) {
		res->fi = NULL;
		res->table = NULL;
		if (fl4->flowi4_oif &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
			/* Apparently, the routing tables are wrong. Assume
			 * that the destination is on-link.
			 *
			 * Because we are allowed to send to an iface even if
			 * it has NO routes and NO assigned addresses. When
			 * oif is specified, the routing tables are looked up
			 * with only one purpose: to catch whether the
			 * destination is gatewayed rather than direct.
			 * Moreover, if MSG_DONTROUTE is set, we send the
			 * packet ignoring both the routing tables and the
			 * ifaddr state (see the userspace sketch below).
			 *
			 * We could do the same even if oif is unknown,
			 * as IPv6 likely does, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res->type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(err);
		goto out;
	}
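
	/*
	 * Illustration (userspace, not kernel code; not part of the original
	 * file): the MSG_DONTROUTE send referenced in the comment above.
	 * A minimal sketch; the destination address, port, and function name
	 * are made up.
	 */
#if 0
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static ssize_t send_dontroute(int fd, const void *buf, size_t len)
{
	struct sockaddr_in dst;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9999);		/* hypothetical port */
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

	/* MSG_DONTROUTE asks the kernel to treat the destination as
	 * on-link, bypassing the routing tables as described above. */
	return sendto(fd, buf, len, MSG_DONTROUTE,
		      (struct sockaddr *)&dst, sizeof(dst));
}
#endif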
	if (res->type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res->fi->fib_prefsrc)
				fl4->saddr = res->fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}

		/* L3 master device is the loopback for that domain */
		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
			net->loopback_dev;

		/* make sure orig_oif points to fib result device even
		 * though packet rx/tx happens over loopback or l3mdev
		 */
		orig_oif = FIB_RES_OIF(*res);

		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	fib_select_path(net, res, fl4, skb);

	dev_out = FIB_RES_DEV(*res);
	fl4->flowi4_oif = dev_out->ifindex;

make_route:
	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);

out:
	return rth;
}
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}

static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.check			= ipv4_blackhole_dst_check,
	.mtu			= ipv4_blackhole_mtu,
	.default_advmss		= ipv4_default_advmss,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.redirect		= ipv4_rt_blackhole_redirect,
	.cow_metrics		= ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		= ipv4_neigh_lookup,
};
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		new->dev = net->loopback_dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;
		rt->rt_mtu_locked = ort->rt_mtu_locked;

		rt->rt_genid = rt_genid_ipv4(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gw_family = ort->rt_gw_family;
		if (rt->rt_gw_family == AF_INET)
			rt->rt_gw4 = ort->rt_gw4;
		else if (rt->rt_gw_family == AF_INET6)
			rt->rt_gw6 = ort->rt_gw6;

		INIT_LIST_HEAD(&rt->rt_uncached);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
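
/*
 * Illustration (not part of the original file): ipv4_blackhole_route() is
 * used by callers such as the xfrm layer to swap a resolved route for a dst
 * that silently discards traffic while preserving the flow's metadata.
 * A minimal sketch, assuming the caller already holds a reference on
 * @dst_orig; the function name is hypothetical.
 */
#if 0
static struct dst_entry *example_discard_flow(struct net *net,
					      struct dst_entry *dst_orig)
{
	/* Consumes the dst_orig reference and returns a discard-only dst
	 * (or ERR_PTR(-ENOMEM)); its input/output hooks drop all packets. */
	return ipv4_blackhole_route(net, dst_orig);
}
#endif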
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    const struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
							flowi4_to_flowi(flp4),
							sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
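
/*
 * Illustration (not part of the original file): a hedged sketch of the
 * typical connect-time pattern around ip_route_output_flow().  The flowi4
 * values and the function name are hypothetical; real callers usually
 * derive them from the socket.  A non-zero flowi4_proto triggers the xfrm
 * lookup in the function above.
 */
#if 0
static int example_connect_route(struct net *net, struct sock *sk,
				 __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4 = {};
	struct rtable *rt;

	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_proto = IPPROTO_TCP;

	rt = ip_route_output_flow(net, &fl4, sk);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	sk_setup_caps(sk, &rt->dst);	/* cache the route on the socket */
	return 0;
}
#endif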
/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
			struct sk_buff *skb, u32 portid, u32 seq)
{
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table_id))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_gw_family == AF_INET &&
	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
		goto nla_put_failure;
	} else if (rt->rt_gw_family == AF_INET6) {
		int alen = sizeof(struct in6_addr);
		struct nlattr *nla;
		struct rtvia *via;

		nla = nla_reserve(skb, RTA_VIA, alen + 2);
		if (!nla)
			goto nla_put_failure;

		via = nla_data(nla);
		via->rtvia_family = AF_INET6;
		memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
	}

	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rt->rt_mtu_locked && expires)
		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
	    nla_put_u32(skb, RTA_UID,
			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
		goto nla_put_failure;

	error = rt->dst.error;

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 fl4->saddr, fl4->daddr,
						 r, portid);

			if (err <= 0) {
				if (err == 0)
					return 0;
				goto nla_put_failure;
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
						   u8 ip_proto, __be16 sport,
						   __be16 dport)
{
	struct sk_buff *skb;
	struct iphdr *iph;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return NULL;

	/* Reserve room for dummy headers, this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IP);
	iph = skb_put(skb, sizeof(struct iphdr));
	iph->protocol = ip_proto;
	iph->saddr = src;
	iph->daddr = dst;
	iph->version = 0x4;
	iph->frag_off = 0;
	iph->ihl = 0x5;

	skb_set_transport_header(skb, skb->len);

	switch (iph->protocol) {
	case IPPROTO_UDP: {
		struct udphdr *udph;

		udph = skb_put_zero(skb, sizeof(struct udphdr));
		udph->source = sport;
		udph->dest = dport;
		udph->len = sizeof(struct udphdr);
		udph->check = 0;
		break;
	}
	case IPPROTO_TCP: {
		struct tcphdr *tcph;

		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
		tcph->source	= sport;
		tcph->dest	= dport;
		tcph->doff	= sizeof(struct tcphdr) / 4;
		tcph->rst = 1;
		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
					    src, dst, 0);
		break;
	}
	case IPPROTO_ICMP: {
		struct icmphdr *icmph;

		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
		icmph->type = ICMP_ECHO;
		icmph->code = 0;
	}
	}

	return skb;
}
static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
				       const struct nlmsghdr *nlh,
				       struct nlattr **tb,
				       struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG(extack,
			       "ipv4: Invalid header for route get request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv4_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
	    rtm->rtm_table || rtm->rtm_protocol ||
	    rtm->rtm_scope || rtm->rtm_type) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
		return -EINVAL;
	}

	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
			       RTM_F_LOOKUP_TABLE |
			       RTM_F_FIB_MATCH)) {
		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv4_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_IIF:
		case RTA_OIF:
		case RTA_SRC:
		case RTA_DST:
		case RTA_IP_PROTO:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_MARK:
		case RTA_UID:
			break;
		default:
			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
			return -EINVAL;
		}
	}

	return 0;
}
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev	= dev;
		skb->mark	= mark;
		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
					 dev, &res);

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
				    rt->rt_type, res.prefix, res.prefixlen,
				    fl4.flowi4_tos, res.fi, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;

errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}
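
/*
 * Illustration (userspace, not kernel code; not part of the original file):
 * inet_rtm_getroute() above is what answers "ip route get".  A minimal
 * raw-netlink sketch of the request side; error handling is abbreviated,
 * the destination value is supplied by the caller, and the function name is
 * hypothetical.
 */
#if 0
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/types.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int route_get(__be32 dst)	/* dst in network byte order */
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		struct rtattr rta;
		__be32 addr;
	} req;
	char reply[4096];
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = sizeof(req);
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET;
	req.rtm.rtm_dst_len = 32;	/* strict validation: RTA_DST is a /32 */
	req.rta.rta_type = RTA_DST;
	req.rta.rta_len = RTA_LENGTH(sizeof(req.addr));
	req.addr = dst;

	send(fd, &req, sizeof(req), 0);
	recv(fd, reply, sizeof(reply), 0);	/* RTM_NEWROUTE from rt_fill_info() */
	close(fd);
	return 0;
}
#endif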
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}
#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly = 8;
static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/*  Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
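
/*
 * Illustration (userspace, not kernel code; not part of the original file):
 * any write to the "flush" entry above, exposed as
 * /proc/sys/net/ipv4/route/flush, flushes the route cache and bumps the
 * fnhe genid via ipv4_sysctl_rtcache_flush().  A minimal sketch; the
 * function name is hypothetical.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static int flush_route_cache(void)
{
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

	if (fd < 0)
		return -1;
	write(fd, "1", 1);	/* the written value is ignored; the write itself triggers the flush */
	close(fd);
	return 0;
}
#endif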
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}
static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init = ipv4_inetpeer_init,
	.exit = ipv4_inetpeer_exit,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
int __init ip_rt_init(void)
{
	int cpu;

	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
				  GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct),
				    __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}
#ifdef CONFIG_SYSCTL
/* We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif