2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #define pr_fmt(fmt) "IPv4: " fmt
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
93 #include <net/net_namespace.h>
94 #include <net/protocol.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
99 #include <net/ip_fib.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
107 #include <linux/sysctl.h>
108 #include <linux/kmemleak.h>
110 #include <net/secure_seq.h>
/* RT_FL_TOS: reduce a flow's TOS field to the bits routing actually keys on
 * (IPTOS_RT_MASK) plus the RTO_ONLINK flag.
 * NOTE(review): this excerpt is a fragmented listing with original line
 * numbers baked into each line; intervening lines are missing.
 */
112 #define RT_FL_TOS(oldflp4) \
113 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
115 #define RT_GC_TIMEOUT (300*HZ)
/* Module tunables: rate limits for ICMP redirect/error generation and the
 * lifetime/floor for learned PMTU values.  Values shown are the defaults.
 */
117 static int ip_rt_max_size;
118 static int ip_rt_redirect_number __read_mostly = 9;
119 static int ip_rt_redirect_load __read_mostly = HZ / 50;
120 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
121 static int ip_rt_error_cost __read_mostly = HZ;
122 static int ip_rt_error_burst __read_mostly = 5 * HZ;
123 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
/* 512 + 20 + 20: presumably 512 bytes payload plus IP and TCP headers —
 * TODO(review) confirm against the original source.
 */
124 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
125 static int ip_rt_min_advmss __read_mostly = 256;
/* Forward declarations for the dst_ops callbacks wired into ipv4_dst_ops
 * below.  NOTE(review): the bodies of ipv4_dst_ifdown, ipv4_cow_metrics and
 * the first ipv4_neigh_lookup declaration are truncated in this excerpt.
 */
128 * Interface to generic destination cache.
131 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
132 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
133 static unsigned int ipv4_mtu(const struct dst_entry *dst);
134 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
135 static void ipv4_link_failure(struct sk_buff *skb);
136 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
137 struct sk_buff *skb, u32 mtu);
138 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
139 struct sk_buff *skb);
140 static void ipv4_dst_destroy(struct dst_entry *dst);
142 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
147 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
153 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
/* The dst_ops vtable for IPv4 routes: binds the generic destination-cache
 * hooks (validation, PMTU, redirect, teardown, neighbour lookup) to the
 * IPv4 implementations declared above.  NOTE(review): the closing brace of
 * the initializer is missing from this excerpt.
 */
157 static struct dst_ops ipv4_dst_ops = {
159 .protocol = cpu_to_be16(ETH_P_IP),
160 .check = ipv4_dst_check,
161 .default_advmss = ipv4_default_advmss,
163 .cow_metrics = ipv4_cow_metrics,
164 .destroy = ipv4_dst_destroy,
165 .ifdown = ipv4_dst_ifdown,
166 .negative_advice = ipv4_negative_advice,
167 .link_failure = ipv4_link_failure,
168 .update_pmtu = ip_rt_update_pmtu,
169 .redirect = ip_do_redirect,
170 .local_out = __ip_local_out,
171 .neigh_lookup = ipv4_neigh_lookup,
/* Map the 4 usable IP TOS bits to packet-scheduler priority bands
 * (TC_PRIO_*).  ECN_OR_COST resolves to the same TC_PRIO class here.
 * NOTE(review): several table entries are missing from this excerpt,
 * so only 10 of the 16 initializers are visible.
 */
174 #define ECN_OR_COST(class) TC_PRIO_##class
176 const __u8 ip_tos2prio[16] = {
178 ECN_OR_COST(BESTEFFORT),
180 ECN_OR_COST(BESTEFFORT),
186 ECN_OR_COST(INTERACTIVE),
188 ECN_OR_COST(INTERACTIVE),
189 TC_PRIO_INTERACTIVE_BULK,
190 ECN_OR_COST(INTERACTIVE_BULK),
191 TC_PRIO_INTERACTIVE_BULK,
192 ECN_OR_COST(INTERACTIVE_BULK)
194 EXPORT_SYMBOL(ip_tos2prio);
/* Per-CPU routing-cache statistics; RT_CACHE_STAT_INC bumps one field on
 * the local CPU without locking.
 */
196 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
197 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
/* /proc plumbing for the (now mostly vestigial) route cache:
 *  - /proc/net/rt_cache        : header-only seq_file (cache removed)
 *  - /proc/net/stat/rt_cache   : per-CPU rt_cache_stat counters
 *  - /proc/net/rt_acct         : per-classid byte/packet accounting
 * NOTE(review): fragmented listing — many braces, returns and error paths
 * are missing between the visible lines.
 */
199 #ifdef CONFIG_PROC_FS
200 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204 return SEQ_START_TOKEN;
207 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
213 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
217 static int rt_cache_seq_show(struct seq_file *seq, void *v)
219 if (v == SEQ_START_TOKEN)
220 seq_printf(seq, "%-127s\n",
221 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
222 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
227 static const struct seq_operations rt_cache_seq_ops = {
228 .start = rt_cache_seq_start,
229 .next = rt_cache_seq_next,
230 .stop = rt_cache_seq_stop,
231 .show = rt_cache_seq_show,
234 static int rt_cache_seq_open(struct inode *inode, struct file *file)
236 return seq_open(file, &rt_cache_seq_ops);
239 static const struct file_operations rt_cache_seq_fops = {
240 .owner = THIS_MODULE,
241 .open = rt_cache_seq_open,
244 .release = seq_release,
/* Per-CPU statistics iterator: *pos == 0 yields the header token, then
 * positions 1..nr_cpu_ids map onto possible CPUs.
 */
248 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
253 return SEQ_START_TOKEN;
255 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
256 if (!cpu_possible(cpu))
259 return &per_cpu(rt_cache_stat, cpu);
264 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
268 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
269 if (!cpu_possible(cpu))
272 return &per_cpu(rt_cache_stat, cpu);
278 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
283 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
285 struct rt_cache_stat *st = v;
287 if (v == SEQ_START_TOKEN) {
288 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
/* gc_* and hlist-search columns are hard-coded 0: the old garbage
 * collector is gone but the column layout is kept for compatibility.
 */
292 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
293 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
294 dst_entries_get_slow(&ipv4_dst_ops),
307 0, /* st->gc_total */
308 0, /* st->gc_ignored */
309 0, /* st->gc_goal_miss */
310 0, /* st->gc_dst_overflow */
311 0, /* st->in_hlist_search */
312 0 /* st->out_hlist_search */
317 static const struct seq_operations rt_cpu_seq_ops = {
318 .start = rt_cpu_seq_start,
319 .next = rt_cpu_seq_next,
320 .stop = rt_cpu_seq_stop,
321 .show = rt_cpu_seq_show,
325 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
327 return seq_open(file, &rt_cpu_seq_ops);
330 static const struct file_operations rt_cpu_seq_fops = {
331 .owner = THIS_MODULE,
332 .open = rt_cpu_seq_open,
335 .release = seq_release,
338 #ifdef CONFIG_IP_ROUTE_CLASSID
/* Sum the per-CPU ip_rt_acct[256] counters into one temporary array and
 * emit it raw; the temporary is presumably kfree()d on a line missing
 * from this excerpt — TODO confirm.
 */
339 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 struct ip_rt_acct *dst, *src;
344 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
348 for_each_possible_cpu(i) {
349 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
350 for (j = 0; j < 256; j++) {
351 dst[j].o_bytes += src[j].o_bytes;
352 dst[j].o_packets += src[j].o_packets;
353 dst[j].i_bytes += src[j].i_bytes;
354 dst[j].i_packets += src[j].i_packets;
358 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
363 static int rt_acct_proc_open(struct inode *inode, struct file *file)
365 return single_open(file, rt_acct_proc_show, NULL);
368 static const struct file_operations rt_acct_proc_fops = {
369 .owner = THIS_MODULE,
370 .open = rt_acct_proc_open,
373 .release = single_release,
/* Per-netns registration of the three proc entries; the tail lines here
 * (398-403) are the error-unwind path removing already-created entries.
 */
377 static int __net_init ip_rt_do_proc_init(struct net *net)
379 struct proc_dir_entry *pde;
381 pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
386 pde = proc_create("rt_cache", S_IRUGO,
387 net->proc_net_stat, &rt_cpu_seq_fops);
391 #ifdef CONFIG_IP_ROUTE_CLASSID
392 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
398 #ifdef CONFIG_IP_ROUTE_CLASSID
400 remove_proc_entry("rt_cache", net->proc_net_stat);
403 remove_proc_entry("rt_cache", net->proc_net);
408 static void __net_exit ip_rt_do_proc_exit(struct net *net)
410 remove_proc_entry("rt_cache", net->proc_net_stat);
411 remove_proc_entry("rt_cache", net->proc_net);
412 #ifdef CONFIG_IP_ROUTE_CLASSID
413 remove_proc_entry("rt_acct", net->proc_net);
417 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
418 .init = ip_rt_do_proc_init,
419 .exit = ip_rt_do_proc_exit,
422 static int __init ip_rt_proc_init(void)
424 return register_pernet_subsys(&ip_rt_proc_ops);
/* Stub used when CONFIG_PROC_FS is off. */
428 static inline int ip_rt_proc_init(void)
432 #endif /* CONFIG_PROC_FS */
434 static inline bool rt_is_expired(const struct rtable *rth)
436 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
/* Invalidate every cached IPv4 route in @net at once by bumping the
 * per-netns route generation counter; entries carrying the old
 * generation are then rejected by rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
/* Resolve the ARP neighbour for a route: key on the gateway when the route
 * uses one, otherwise on the packet's (or caller's) destination address.
 * Falls through to neigh_create() when no entry exists yet.
 * NOTE(review): the conditionals selecting between rt_gateway, skb daddr
 * and the caller-supplied daddr are missing from this fragment.
 */
444 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
448 struct net_device *dev = dst->dev;
449 const __be32 *pkey = daddr;
450 const struct rtable *rt;
453 rt = (const struct rtable *) dst;
455 pkey = (const __be32 *) &rt->rt_gateway;
457 pkey = &ip_hdr(skb)->daddr;
459 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
462 return neigh_create(&arp_tbl, pkey, dev);
466 * Peer allocation may fail only in serious out-of-memory conditions. However
467 * we still can generate some output.
468 * Random ID selection looks a bit dangerous because we have no chances to
469 * select ID being unique in a reasonable period of time.
470 * But broken packet identifier may be better than no packet at all.
/* Fallback IP-ID generator used when no inet_peer is available: mixes the
 * previous id with the destination via secure_ip_id() under a spinlock.
 */
472 static void ip_select_fb_ident(struct iphdr *iph)
474 static DEFINE_SPINLOCK(ip_fb_id_lock);
475 static u32 ip_fallback_id;
478 spin_lock_bh(&ip_fb_id_lock);
479 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
480 iph->id = htons(salt & 0xFFFF);
481 ip_fallback_id = salt;
482 spin_unlock_bh(&ip_fb_id_lock);
/* Pick the IP header id from the per-destination inet_peer counter when a
 * peer exists, else from the global fallback above.
 */
485 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
487 struct net *net = dev_net(dst->dev);
488 struct inet_peer *peer;
490 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
492 iph->id = htons(inet_getid(peer, more));
497 ip_select_fb_ident(iph);
499 EXPORT_SYMBOL(__ip_select_ident);
/* Build a flowi4 lookup key from an IP header, preferring the socket's
 * bound device / TOS / protocol when @sk is given (the sk branch lines
 * are partially missing from this fragment).
 */
501 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
502 const struct iphdr *iph,
504 u8 prot, u32 mark, int flow_flags)
507 const struct inet_sock *inet = inet_sk(sk);
509 oif = sk->sk_bound_dev_if;
511 tos = RT_CONN_FLAGS(sk);
512 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
514 flowi4_init_output(fl4, oif, mark, tos,
515 RT_SCOPE_UNIVERSE, prot,
517 iph->daddr, iph->saddr, 0, 0);
/* Flow key taken from a received skb: oif/tos/protocol/mark all come from
 * the packet and its ingress device.
 */
520 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
521 const struct sock *sk)
523 const struct iphdr *iph = ip_hdr(skb);
524 int oif = skb->dev->ifindex;
525 u8 tos = RT_TOS(iph->tos);
526 u8 prot = iph->protocol;
527 u32 mark = skb->mark;
529 __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
/* Flow key taken from a connected socket; honours an SRR option's first-hop
 * address (opt.faddr) as the destination when present.
 */
532 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
534 const struct inet_sock *inet = inet_sk(sk);
535 const struct ip_options_rcu *inet_opt;
536 __be32 daddr = inet->inet_daddr;
539 inet_opt = rcu_dereference(inet->inet_opt);
540 if (inet_opt && inet_opt->opt.srr)
541 daddr = inet_opt->opt.faddr;
542 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
543 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
544 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
545 inet_sk_flowi_flags(sk),
546 daddr, inet->inet_saddr, 0, 0);
/* Dispatch: use the skb-derived key when a packet is available, otherwise
 * derive the key from the socket.
 */
550 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
551 const struct sk_buff *skb)
554 build_skb_flow_key(fl4, skb, sk);
556 build_sk_flow_key(fl4, sk);
559 static inline void rt_free(struct rtable *rt)
561 call_rcu(&rt->dst.rcu_head, dst_rcu_free);
/* fnhe_lock serializes all writers of the per-nexthop exception tables. */
564 static DEFINE_SPINLOCK(fnhe_lock);
/* Detach and RCU-free both cached routes (input and output) hanging off a
 * nexthop exception entry.
 */
566 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
570 rt = rcu_dereference(fnhe->fnhe_rth_input);
572 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
575 rt = rcu_dereference(fnhe->fnhe_rth_output);
577 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
/* Walk one hash chain, pick the entry with the oldest fnhe_stamp, flush its
 * cached routes and return it for reuse (called when a bucket is full).
 */
582 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
584 struct fib_nh_exception *fnhe, *oldest;
586 oldest = rcu_dereference(hash->chain);
587 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
588 fnhe = rcu_dereference(fnhe->fnhe_next)) {
589 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
592 fnhe_flush_routes(oldest);
596 static inline u32 fnhe_hashfun(__be32 daddr)
600 hval = (__force u32) daddr;
601 hval ^= (hval >> 11) ^ (hval >> 22);
603 return hval & (FNHE_HASH_SIZE - 1);
606 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
608 rt->rt_pmtu = fnhe->fnhe_pmtu;
609 rt->dst.expires = fnhe->fnhe_expires;
612 rt->rt_flags |= RTCF_REDIRECTED;
613 rt->rt_gateway = fnhe->fnhe_gw;
614 rt->rt_uses_gateway = 1;
/* Record (or refresh) a nexthop exception for @daddr on @nh: a redirect
 * gateway @gw and/or a learned PMTU with @expires.  Allocates the hash
 * table on first use, reuses the oldest chain entry past
 * FNHE_RECLAIM_DEPTH, and marks all cached routes for this nexthop
 * DST_OBSOLETE_KILL so lookups revalidate against the new exception.
 * Runs under fnhe_lock.  NOTE(review): several branch/brace lines are
 * missing from this fragment.
 */
618 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
619 u32 pmtu, unsigned long expires)
621 struct fnhe_hash_bucket *hash;
622 struct fib_nh_exception *fnhe;
626 u32 hval = fnhe_hashfun(daddr);
628 spin_lock_bh(&fnhe_lock);
630 hash = nh->nh_exceptions;
632 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
635 nh->nh_exceptions = hash;
641 for (fnhe = rcu_dereference(hash->chain); fnhe;
642 fnhe = rcu_dereference(fnhe->fnhe_next)) {
643 if (fnhe->fnhe_daddr == daddr)
/* Existing entry: refresh pmtu/expiry and push into cached dsts. */
652 fnhe->fnhe_pmtu = pmtu;
653 fnhe->fnhe_expires = max(1UL, expires);
655 /* Update all cached dsts too */
656 rt = rcu_dereference(fnhe->fnhe_rth_input);
658 fill_route_from_fnhe(rt, fnhe);
659 rt = rcu_dereference(fnhe->fnhe_rth_output);
661 fill_route_from_fnhe(rt, fnhe);
/* No entry: recycle the oldest when the chain is deep, else allocate. */
663 if (depth > FNHE_RECLAIM_DEPTH)
664 fnhe = fnhe_oldest(hash);
666 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
670 fnhe->fnhe_next = hash->chain;
671 rcu_assign_pointer(hash->chain, fnhe);
673 fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
674 fnhe->fnhe_daddr = daddr;
676 fnhe->fnhe_pmtu = pmtu;
677 fnhe->fnhe_expires = expires;
679 /* Exception created; mark the cached routes for the nexthop
680 * stale, so anyone caching it rechecks if this exception
683 rt = rcu_dereference(nh->nh_rth_input);
685 rt->dst.obsolete = DST_OBSOLETE_KILL;
687 for_each_possible_cpu(i) {
688 struct rtable __rcu **prt;
689 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
690 rt = rcu_dereference(*prt);
692 rt->dst.obsolete = DST_OBSOLETE_KILL;
696 fnhe->fnhe_stamp = jiffies;
699 spin_unlock_bh(&fnhe_lock);
/* Handle a received ICMP redirect: after sanity-checking the advertised
 * gateway (ICMP code, current gateway match, no multicast/broadcast/zeronet,
 * on-link and default-gateway checks), resolve the new gateway's neighbour
 * and — once it is NUD_VALID — store it as a nexthop exception via
 * update_or_create_fnhe() and kill the current cached route.  Rejected
 * redirects are optionally logged as martians.
 */
703 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
706 __be32 new_gw = icmp_hdr(skb)->un.gateway;
707 __be32 old_gw = ip_hdr(skb)->saddr;
708 struct net_device *dev = skb->dev;
709 struct in_device *in_dev;
710 struct fib_result res;
/* Only the four host/network redirect codes are acted upon. */
714 switch (icmp_hdr(skb)->code & 7) {
716 case ICMP_REDIR_NETTOS:
717 case ICMP_REDIR_HOST:
718 case ICMP_REDIR_HOSTTOS:
/* Redirect must come from our current gateway for this route. */
725 if (rt->rt_gateway != old_gw)
728 in_dev = __in_dev_get_rcu(dev);
733 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
734 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
735 ipv4_is_zeronet(new_gw))
736 goto reject_redirect;
738 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
739 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
740 goto reject_redirect;
741 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
742 goto reject_redirect;
744 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
745 goto reject_redirect;
748 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
/* Not yet resolved: kick ARP and retry later rather than trusting it. */
750 if (!(n->nud_state & NUD_VALID)) {
751 neigh_event_send(n, NULL);
753 if (fib_lookup(net, fl4, &res) == 0) {
754 struct fib_nh *nh = &FIB_RES_NH(res);
756 update_or_create_fnhe(nh, fl4->daddr, new_gw,
760 rt->dst.obsolete = DST_OBSOLETE_KILL;
761 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
768 #ifdef CONFIG_IP_ROUTE_VERBOSE
769 if (IN_DEV_LOG_MARTIANS(in_dev)) {
770 const struct iphdr *iph = (const struct iphdr *) skb->data;
771 __be32 daddr = iph->daddr;
772 __be32 saddr = iph->saddr;
774 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
775 " Advised path = %pI4 -> %pI4\n",
776 &old_gw, dev->name, &new_gw,
/* dst_ops.redirect hook: rebuild the flow key from the offending skb and
 * delegate to __ip_do_redirect() (kill-route variant).
 */
783 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
787 const struct iphdr *iph = (const struct iphdr *) skb->data;
788 int oif = skb->dev->ifindex;
789 u8 tos = RT_TOS(iph->tos);
790 u8 prot = iph->protocol;
791 u32 mark = skb->mark;
793 rt = (struct rtable *) dst;
795 __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
796 __ip_do_redirect(rt, skb, &fl4, true);
/* dst_ops.negative_advice hook: drop a cached route that has been killed
 * (obsolete > 0) or redirected/expired so the caller re-resolves.
 * NOTE(review): the body of both branches is missing from this fragment.
 */
799 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
801 struct rtable *rt = (struct rtable *)dst;
802 struct dst_entry *ret = dst;
805 if (dst->obsolete > 0) {
808 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
819 * 1. The first ip_rt_redirect_number redirects are sent
820 * with exponential backoff, then we stop sending them at all,
821 * assuming that the host ignores our redirects.
822 * 2. If we did not see packets requiring redirects
823 * during ip_rt_redirect_silence, we assume that the host
824 * forgot redirected route and start to send redirects again.
826 * This algorithm is much cheaper and more intelligent than dumb load limiting
829 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
830 * and "frag. need" (breaks PMTU discovery) in icmp.c.
/* Send an ICMP host redirect for a packet we are forwarding back out the
 * interface it arrived on, rate-limited per source peer using the
 * exponential-backoff scheme described in the comment block above
 * (ip_rt_redirect_number / _load / _silence).  Without peer state the
 * redirect is sent unconditionally.
 * NOTE(review): peer->rate_tokens is used both as the redirect counter and
 * the token bucket here; later kernels split out a dedicated n_redirects
 * counter — confirm against upstream before relying on this limiter.
 */
833 void ip_rt_send_redirect(struct sk_buff *skb)
835 struct rtable *rt = skb_rtable(skb);
836 struct in_device *in_dev;
837 struct inet_peer *peer;
842 in_dev = __in_dev_get_rcu(rt->dst.dev);
843 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
847 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
850 net = dev_net(rt->dst.dev);
851 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
/* No peer entry available: send without any rate limiting. */
853 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
854 rt_nexthop(rt, ip_hdr(skb)->daddr));
858 /* No redirected packets during ip_rt_redirect_silence;
859 * reset the algorithm.
861 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
862 peer->rate_tokens = 0;
864 /* Too many ignored redirects; do not send anything
865 * set dst.rate_last to the last seen redirected packet.
867 if (peer->rate_tokens >= ip_rt_redirect_number) {
868 peer->rate_last = jiffies;
872 /* Check for load limit; set rate_last to the latest sent
875 if (peer->rate_tokens == 0 ||
878 (ip_rt_redirect_load << peer->rate_tokens)))) {
879 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
881 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
882 peer->rate_last = jiffies;
884 #ifdef CONFIG_IP_ROUTE_VERBOSE
886 peer->rate_tokens == ip_rt_redirect_number)
887 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
888 &ip_hdr(skb)->saddr, inet_iif(skb),
889 &ip_hdr(skb)->daddr, &gw);
/* dst.input handler for routes that resolved to an error: choose the ICMP
 * unreachable code from rt->dst.error (and count the SNMP stat), then send
 * ICMP_DEST_UNREACH rate-limited by a per-source-peer token bucket
 * (ip_rt_error_cost tokens per send, capped at ip_rt_error_burst).
 */
896 static int ip_error(struct sk_buff *skb)
898 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
899 struct rtable *rt = skb_rtable(skb);
900 struct inet_peer *peer;
906 net = dev_net(rt->dst.dev);
/* Non-forwarding interface: only bump counters, pick PROHIBITED codes. */
907 if (!IN_DEV_FORWARD(in_dev)) {
908 switch (rt->dst.error) {
910 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
914 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
920 switch (rt->dst.error) {
925 code = ICMP_HOST_UNREACH;
928 code = ICMP_NET_UNREACH;
929 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
932 code = ICMP_PKT_FILTERED;
936 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
/* Token bucket: accumulate elapsed jiffies, clamp to the burst, and only
 * send when at least one full "cost" worth of tokens is available.
 */
941 peer->rate_tokens += now - peer->rate_last;
942 if (peer->rate_tokens > ip_rt_error_burst)
943 peer->rate_tokens = ip_rt_error_burst;
944 peer->rate_last = now;
945 if (peer->rate_tokens >= ip_rt_error_cost)
946 peer->rate_tokens -= ip_rt_error_cost;
952 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
/* Record a learned path MTU for @rt: ignore it when the metric is locked
 * or not smaller than the device MTU, clamp to ip_rt_min_pmtu, skip a
 * refresh when the same value is still fresh, then store it as a nexthop
 * exception expiring after ip_rt_mtu_expires.
 */
958 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
960 struct dst_entry *dst = &rt->dst;
961 struct fib_result res;
963 if (dst_metric_locked(dst, RTAX_MTU))
966 if (dst->dev->mtu < mtu)
969 if (mtu < ip_rt_min_pmtu)
970 mtu = ip_rt_min_pmtu;
/* Same PMTU already recorded and not yet past half its lifetime. */
972 if (rt->rt_pmtu == mtu &&
973 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
977 if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
978 struct fib_nh *nh = &FIB_RES_NH(res);
980 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
981 jiffies + ip_rt_mtu_expires);
/* dst_ops.update_pmtu hook: rebuild the flow key and delegate. */
986 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
987 struct sk_buff *skb, u32 mtu)
989 struct rtable *rt = (struct rtable *) dst;
992 ip_rt_build_flow_key(&fl4, sk, skb);
993 __ip_rt_update_pmtu(rt, &fl4, mtu);
/* Public entry: apply a PMTU update for the flow described by @skb's inner
 * IP header (used by ICMP frag-needed handling outside socket context).
 * Performs a fresh route lookup and updates that route's PMTU state.
 */
996 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
997 int oif, u32 mark, u8 protocol, int flow_flags)
999 const struct iphdr *iph = (const struct iphdr *) skb->data;
1003 __build_flow_key(&fl4, NULL, iph, oif,
1004 RT_TOS(iph->tos), protocol, mark, flow_flags);
1005 rt = __ip_route_output_key(net, &fl4);
1007 __ip_rt_update_pmtu(rt, &fl4, mtu);
1011 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
/* Socket-context variant: derive the flow key from @sk instead of explicit
 * oif/mark/protocol arguments.
 */
1013 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1015 const struct iphdr *iph = (const struct iphdr *) skb->data;
1019 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1020 rt = __ip_route_output_key(sock_net(sk), &fl4);
1022 __ip_rt_update_pmtu(rt, &fl4, mtu);
1027 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1029 const struct iphdr *iph = (const struct iphdr *) skb->data;
1032 struct dst_entry *odst = NULL;
1037 if (!ip_sk_accept_pmtu(sk))
1040 odst = sk_dst_get(sk);
1042 if (sock_owned_by_user(sk) || !odst) {
1043 __ipv4_sk_update_pmtu(skb, sk, mtu);
1047 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1049 rt = (struct rtable *)odst;
1050 if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
1051 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1058 __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1060 if (!dst_check(&rt->dst, 0)) {
1062 dst_release(&rt->dst);
1064 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1072 sk_dst_set(sk, &rt->dst);
1078 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
/* Public entry: apply an ICMP redirect for the flow described by @skb's
 * inner IP header, via a fresh route lookup (no cached route is killed:
 * __ip_do_redirect is called with kill_route == false).
 */
1080 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1081 int oif, u32 mark, u8 protocol, int flow_flags)
1083 const struct iphdr *iph = (const struct iphdr *) skb->data;
1087 __build_flow_key(&fl4, NULL, iph, oif,
1088 RT_TOS(iph->tos), protocol, mark, flow_flags);
1089 rt = __ip_route_output_key(net, &fl4);
1091 __ip_do_redirect(rt, skb, &fl4, false);
1095 EXPORT_SYMBOL_GPL(ipv4_redirect);
/* Socket-context redirect: flow key derived from @sk. */
1097 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1099 const struct iphdr *iph = (const struct iphdr *) skb->data;
1103 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1104 rt = __ip_route_output_key(sock_net(sk), &fl4);
1106 __ip_do_redirect(rt, skb, &fl4, false);
1110 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
/* dst_ops.check hook: a cached IPv4 dst is valid only while it still has
 * obsolete == DST_OBSOLETE_FORCE_CHK and its genid matches the netns (see
 * rt_is_expired); otherwise return NULL so the caller re-resolves.
 */
1112 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1114 struct rtable *rt = (struct rtable *) dst;
1116 /* All IPV4 dsts are created with ->obsolete set to the value
1117 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1118 * into this function always.
1120 * When a PMTU/redirect information update invalidates a route,
1121 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1122 * DST_OBSOLETE_DEAD by dst_free().
1124 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
/* dst_ops.link_failure hook: report host-unreachable to the sender and
 * expire the attached route immediately.
 */
1129 static void ipv4_link_failure(struct sk_buff *skb)
1133 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1135 rt = skb_rtable(skb);
1137 dst_set_expires(&rt->dst, 0);
/* Debug catch-all output handler: should never run; logs the flow. */
1140 static int ip_rt_bug(struct sk_buff *skb)
1142 pr_debug("%s: %pI4 -> %pI4, %s\n",
1143 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1144 skb->dev ? skb->dev->name : "?");
1151 We do not cache source address of outgoing interface,
1152 because it is used only by IP RR, TS and SRR options,
1153 so that it out of fast path.
1155 BTW remember: "addr" is allowed to be not aligned
/* Copy the source address to use for this route into @addr (4 bytes,
 * possibly unaligned — hence the memcpy).  Output routes use the packet's
 * own saddr; input routes do a reverse fib_lookup for the preferred source,
 * falling back to inet_select_addr() toward the nexthop.
 */
1159 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1163 if (rt_is_output_route(rt))
1164 src = ip_hdr(skb)->saddr;
1166 struct fib_result res;
1172 memset(&fl4, 0, sizeof(fl4));
1173 fl4.daddr = iph->daddr;
1174 fl4.saddr = iph->saddr;
1175 fl4.flowi4_tos = RT_TOS(iph->tos);
1176 fl4.flowi4_oif = rt->dst.dev->ifindex;
1177 fl4.flowi4_iif = skb->dev->ifindex;
1178 fl4.flowi4_mark = skb->mark;
1181 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1182 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1184 src = inet_select_addr(rt->dst.dev,
1185 rt_nexthop(rt, iph->daddr),
1189 memcpy(addr, &src, 4);
1192 #ifdef CONFIG_IP_ROUTE_CLASSID
/* Fill whichever halves of the 32-bit tclassid are still zero with the
 * corresponding halves of @tag (realm tagging for route accounting).
 */
1193 static void set_class_tag(struct rtable *rt, u32 tag)
1195 if (!(rt->dst.tclassid & 0xFFFF))
1196 rt->dst.tclassid |= tag & 0xFFFF;
1197 if (!(rt->dst.tclassid & 0xFFFF0000))
1198 rt->dst.tclassid |= tag & 0xFFFF0000;
/* dst_ops.default_advmss hook: advertised MSS defaults to device MTU minus
 * 40 (IP + TCP header), capped at 65535 - 40.
 */
1202 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1204 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1207 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1209 if (advmss > 65535 - 40)
1210 advmss = 65535 - 40;
/* Effective MTU: prefer an unexpired learned PMTU, then the RTAX_MTU
 * metric, then the device MTU; locked-MTU gateway routes presumably clamp
 * to 576 here (clamp line missing from this fragment — TODO confirm).
 */
1215 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1217 const struct rtable *rt = (const struct rtable *) dst;
1218 unsigned int mtu = rt->rt_pmtu;
1220 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1221 mtu = dst_metric_raw(dst, RTAX_MTU);
1226 mtu = dst->dev->mtu;
1228 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1229 if (rt->rt_uses_gateway && mtu > 576)
1233 return min_t(unsigned int, mtu, IP_MAX_MTU);
/* Look up the nexthop exception for @daddr in @nh's hash table (RCU
 * traversal of one bucket chain); returns NULL when the table does not
 * exist or holds no matching entry.
 */
1236 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1238 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1239 struct fib_nh_exception *fnhe;
1245 hval = fnhe_hashfun(daddr);
1247 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1248 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1249 if (fnhe->fnhe_daddr == daddr)
/* Attach new route @rt to nexthop exception @fnhe (input or output slot
 * depending on the route direction).  Under fnhe_lock: a genid mismatch
 * first wipes the exception's learned state, then the exception's
 * PMTU/gateway is copied into @rt and, for cacheable routes, @rt is
 * published into the slot.  Returns whether the route was cached.
 */
1255 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1260 spin_lock_bh(&fnhe_lock);
1262 if (daddr == fnhe->fnhe_daddr) {
1263 struct rtable __rcu **porig;
1264 struct rtable *orig;
1265 int genid = fnhe_genid(dev_net(rt->dst.dev));
1267 if (rt_is_input_route(rt))
1268 porig = &fnhe->fnhe_rth_input;
1270 porig = &fnhe->fnhe_rth_output;
1271 orig = rcu_dereference(*porig);
/* Stale generation: reset learned PMTU/expiry and drop cached routes. */
1273 if (fnhe->fnhe_genid != genid) {
1274 fnhe->fnhe_genid = genid;
1276 fnhe->fnhe_pmtu = 0;
1277 fnhe->fnhe_expires = 0;
1278 fnhe_flush_routes(fnhe);
1281 fill_route_from_fnhe(rt, fnhe);
1282 if (!rt->rt_gateway)
1283 rt->rt_gateway = daddr;
1285 if (!(rt->dst.flags & DST_NOCACHE)) {
1286 rcu_assign_pointer(*porig, rt);
1292 fnhe->fnhe_stamp = jiffies;
1294 spin_unlock_bh(&fnhe_lock);
/* Publish @rt as the cached route on @nh: the single input slot for input
 * routes, else this CPU's per-cpu output slot; installed with cmpxchg
 * against the previously observed value.
 */
1299 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1301 struct rtable *orig, *prev, **p;
1304 if (rt_is_input_route(rt)) {
1305 p = (struct rtable **)&nh->nh_rth_input;
1307 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1311 prev = cmpxchg(p, orig, rt);
/* Global list of uncacheable (DST_NOCACHE) routes so device teardown can
 * still find and re-home them; guarded by rt_uncached_lock.
 */
1321 static DEFINE_SPINLOCK(rt_uncached_lock);
1322 static LIST_HEAD(rt_uncached_list);
1324 static void rt_add_uncached_list(struct rtable *rt)
1326 spin_lock_bh(&rt_uncached_lock);
1327 list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1328 spin_unlock_bh(&rt_uncached_lock);
/* dst_ops.destroy hook: unlink from the uncached list if present. */
1331 static void ipv4_dst_destroy(struct dst_entry *dst)
1333 struct rtable *rt = (struct rtable *) dst;
1335 if (!list_empty(&rt->rt_uncached)) {
1336 spin_lock_bh(&rt_uncached_lock);
1337 list_del(&rt->rt_uncached);
1338 spin_unlock_bh(&rt_uncached_lock);
/* On device unregister: repoint any uncached routes that still reference
 * @dev at the netns loopback device so the dev refcount can drop.
 */
1342 void rt_flush_dev(struct net_device *dev)
1344 if (!list_empty(&rt_uncached_list)) {
1345 struct net *net = dev_net(dev);
1348 spin_lock_bh(&rt_uncached_lock);
1349 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1350 if (rt->dst.dev != dev)
1352 rt->dst.dev = net->loopback_dev;
1353 dev_hold(rt->dst.dev);
1356 spin_unlock_bh(&rt_uncached_lock);
/* A cached route is reusable while it still carries DST_OBSOLETE_FORCE_CHK
 * and is not genid-expired (leading lines missing from this fragment).
 */
1360 static bool rt_cache_valid(const struct rtable *rt)
1363 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
/* Finish constructing route @rt from FIB result @res: copy the nexthop
 * gateway (link-scope only), metrics and classid from the fib_info, bind
 * to a nexthop exception or cache on the nexthop, and fall back to the
 * uncached list (setting DST_NOCACHE) when caching fails or there is no
 * fib_info.  Finally apply the routing-realm class tag.
 */
1367 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1368 const struct fib_result *res,
1369 struct fib_nh_exception *fnhe,
1370 struct fib_info *fi, u16 type, u32 itag)
1372 bool cached = false;
1375 struct fib_nh *nh = &FIB_RES_NH(*res);
1377 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1378 rt->rt_gateway = nh->nh_gw;
1379 rt->rt_uses_gateway = 1;
1381 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1382 #ifdef CONFIG_IP_ROUTE_CLASSID
1383 rt->dst.tclassid = nh->nh_tclassid;
1386 cached = rt_bind_exception(rt, fnhe, daddr);
1387 else if (!(rt->dst.flags & DST_NOCACHE))
1388 cached = rt_cache_route(nh, rt);
1389 if (unlikely(!cached)) {
1390 /* Routes we intend to cache in nexthop exception or
1391 * FIB nexthop have the DST_NOCACHE bit clear.
1392 * However, if we are unsuccessful at storing this
1393 * route into the cache we really need to set it.
1395 rt->dst.flags |= DST_NOCACHE;
1396 if (!rt->rt_gateway)
1397 rt->rt_gateway = daddr;
1398 rt_add_uncached_list(rt);
1401 rt_add_uncached_list(rt);
1403 #ifdef CONFIG_IP_ROUTE_CLASSID
1404 #ifdef CONFIG_IP_MULTIPLE_TABLES
1405 set_class_tag(rt, res->tclassid)
1407 set_class_tag(rt, itag);
/* Allocate a new rtable with DST_OBSOLETE_FORCE_CHK so every use goes
 * through ipv4_dst_check(); DST_NOCACHE/DST_HOST set when not caching.
 */
1411 static struct rtable *rt_dst_alloc(struct net_device *dev,
1412 bool nopolicy, bool noxfrm, bool will_cache)
1414 return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1415 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1416 (nopolicy ? DST_NOPOLICY : 0) |
1417 (noxfrm ? DST_NOXFRM : 0));
1420 /* called in rcu_read_lock() section */
/* Input-route construction for multicast destinations: sanity-check the
 * source (no multicast/broadcast saddr, loopback only with route_localnet,
 * zeronet only for local multicast), validate against the FIB, then build
 * an RTCF_MULTICAST route delivered locally (and via ip_mr_input when
 * multicast forwarding applies).
 */
1421 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1422 u8 tos, struct net_device *dev, int our)
1425 struct in_device *in_dev = __in_dev_get_rcu(dev);
1429 /* Primary sanity checks. */
1434 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1435 skb->protocol != htons(ETH_P_IP))
1438 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1439 if (ipv4_is_loopback(saddr))
1442 if (ipv4_is_zeronet(saddr)) {
1443 if (!ipv4_is_local_multicast(daddr))
1446 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1451 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1452 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1456 #ifdef CONFIG_IP_ROUTE_CLASSID
1457 rth->dst.tclassid = itag;
/* Multicast routes must never be used for output directly. */
1459 rth->dst.output = ip_rt_bug;
1461 rth->rt_genid = rt_genid_ipv4(dev_net(dev));
1462 rth->rt_flags = RTCF_MULTICAST;
1463 rth->rt_type = RTN_MULTICAST;
1464 rth->rt_is_input= 1;
1467 rth->rt_gateway = 0;
1468 rth->rt_uses_gateway = 0;
1469 INIT_LIST_HEAD(&rth->rt_uncached);
1471 rth->dst.input= ip_local_deliver;
1472 rth->rt_flags |= RTCF_LOCAL;
1475 #ifdef CONFIG_IP_MROUTE
1476 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1477 rth->dst.input = ip_mr_input;
1479 RT_CACHE_STAT_INC(in_slow_mc);
1481 skb_dst_set(skb, &rth->dst);
/* Log a martian source address (rate-limited) when the device has
 * log_martians enabled.  Per the RFC 1812 note below, the MAC header is the
 * only useful hint about where the bogus packet came from, so it is hexdumped
 * when available.
 * NOTE(review): braces and the trailing lines of this function are missing
 * from the extraction.
 */
1493 static void ip_handle_martian_source(struct net_device *dev,
1494 struct in_device *in_dev,
1495 struct sk_buff *skb,
1499 RT_CACHE_STAT_INC(in_martian_src);
1500 #ifdef CONFIG_IP_ROUTE_VERBOSE
1501 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1503 * RFC1812 recommendation, if source is martian,
1504 * the only hint is MAC header.
1506 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1507 &daddr, &saddr, dev->name);
1508 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1509 print_hex_dump(KERN_WARNING, "ll header: ",
1510 DUMP_PREFIX_OFFSET, 16, 1,
1511 skb_mac_header(skb),
1512 dev->hard_header_len, true);
1518 /* called in rcu_read_lock() section */
/* Create (or reuse from the per-nexthop cache) a forwarding route for an
 * input packet: validate the source, decide whether to send an ICMP redirect
 * (RTCF_DOREDIRECT), look for a nexthop exception, and finally allocate and
 * populate the rtable with ip_forward/ip_output handlers.
 * NOTE(review): error paths, labels and braces are missing from this
 * extraction; visible statements are unmodified.
 */
1519 static int __mkroute_input(struct sk_buff *skb,
1520 const struct fib_result *res,
1521 struct in_device *in_dev,
1522 __be32 daddr, __be32 saddr, u32 tos)
1524 struct fib_nh_exception *fnhe;
1527 struct in_device *out_dev;
1528 unsigned int flags = 0;
1532 /* get a working reference to the output device */
1533 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1534 if (out_dev == NULL) {
1535 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1539 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1540 in_dev->dev, in_dev, &itag);
1542 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
/* Only cache when the FIB gave us a fib_info and source validation set
 * no classid tag. */
1548 do_cache = res->fi && !itag;
/* Packet would go back out the interface it came in on: candidate for an
 * ICMP redirect if the media is shared or the gateway is on-link. */
1549 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1550 (IN_DEV_SHARED_MEDIA(out_dev) ||
1551 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1552 flags |= RTCF_DOREDIRECT;
1556 if (skb->protocol != htons(ETH_P_IP)) {
1557 /* Not IP (i.e. ARP). Do not create route, if it is
1558 * invalid for proxy arp. DNAT routes are always valid.
1560 * Proxy arp feature have been extended to allow, ARP
1561 * replies back to the same interface, to support
1562 * Private VLAN switch technologies. See arp.c.
1564 if (out_dev == in_dev &&
1565 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
/* Prefer a cached route: first the nexthop exception, then the
 * per-nexthop input route cache. */
1571 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1574 rth = rcu_dereference(fnhe->fnhe_rth_input);
1576 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1578 if (rt_cache_valid(rth)) {
1579 skb_dst_set_noref(skb, &rth->dst);
1584 rth = rt_dst_alloc(out_dev->dev,
1585 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1586 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1592 rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1593 rth->rt_flags = flags;
1594 rth->rt_type = res->type;
1595 rth->rt_is_input = 1;
1598 rth->rt_gateway = 0;
1599 rth->rt_uses_gateway = 0;
1600 INIT_LIST_HEAD(&rth->rt_uncached);
1601 RT_CACHE_STAT_INC(in_slow_tot);
/* Forwarded route: input continues via ip_forward, output via ip_output. */
1603 rth->dst.input = ip_forward;
1604 rth->dst.output = ip_output;
1606 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1607 skb_dst_set(skb, &rth->dst);
/* Thin wrapper around __mkroute_input(): with CONFIG_IP_ROUTE_MULTIPATH it
 * first picks one nexthop from a multipath route via fib_select_multipath().
 * NOTE(review): braces and the '#endif' line are missing from this
 * extraction.
 */
1614 static int ip_mkroute_input(struct sk_buff *skb,
1615 struct fib_result *res,
1616 const struct flowi4 *fl4,
1617 struct in_device *in_dev,
1618 __be32 daddr, __be32 saddr, u32 tos)
1620 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1621 if (res->fi && res->fi->fib_nhs > 1)
1622 fib_select_multipath(res);
1625 /* create a routing cache entry */
1626 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1630 * NOTE. We drop all the packets that has local source
1631 * addresses, because every properly looped back packet
1632 * must have correct destination already attached by output routine.
1634 * Such approach solves two big problems:
1635 * 1. Not simplex devices are handled properly.
1636 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1637 * called with rcu_read_lock()
/* Slow-path input route resolution: martian filtering, FIB lookup, then
 * dispatch to forwarding (ip_mkroute_input), broadcast or local delivery.
 * Local/broadcast routes may be cached on the FIB nexthop.
 * NOTE(review): many lines (labels such as 'brd_input:'/'local_input:',
 * error returns, braces) are missing from this extraction; only the visible
 * statements are documented here.
 */
1640 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1641 u8 tos, struct net_device *dev)
1643 struct fib_result res;
1644 struct in_device *in_dev = __in_dev_get_rcu(dev);
1646 unsigned int flags = 0;
1650 struct net *net = dev_net(dev);
1653 /* IP on this device is disabled. */
1658 /* Check for the most weird martians, which can be not detected
/* Multicast/broadcast can never be a valid source address. */
1662 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1663 goto martian_source;
1666 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1669 /* Accept zero addresses only to limited broadcast;
1670 * I even do not know to fix it or not. Waiting for complains :-)
1672 if (ipv4_is_zeronet(saddr))
1673 goto martian_source;
1675 if (ipv4_is_zeronet(daddr))
1676 goto martian_destination;
1678 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1679 * and call it once if daddr or/and saddr are loopback addresses
1681 if (ipv4_is_loopback(daddr)) {
1682 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1683 goto martian_destination;
1684 } else if (ipv4_is_loopback(saddr)) {
1685 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1686 goto martian_source;
1690 * Now we are ready to route packet.
/* Build the flow key and consult the FIB. */
1693 fl4.flowi4_iif = dev->ifindex;
1694 fl4.flowi4_mark = skb->mark;
1695 fl4.flowi4_tos = tos;
1696 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1699 err = fib_lookup(net, &fl4, &res);
1701 if (!IN_DEV_FORWARD(in_dev))
1702 err = -EHOSTUNREACH;
1706 if (res.type == RTN_BROADCAST)
1709 if (res.type == RTN_LOCAL) {
1710 err = fib_validate_source(skb, saddr, daddr, tos,
1712 dev, in_dev, &itag);
1714 goto martian_source_keep_err;
/* Forwarding path: requires forwarding enabled and a unicast result. */
1718 if (!IN_DEV_FORWARD(in_dev)) {
1719 err = -EHOSTUNREACH;
1722 if (res.type != RTN_UNICAST)
1723 goto martian_destination;
1725 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
/* Broadcast input handling. */
1729 if (skb->protocol != htons(ETH_P_IP))
1732 if (!ipv4_is_zeronet(saddr)) {
1733 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1736 goto martian_source_keep_err;
1738 flags |= RTCF_BROADCAST;
1739 res.type = RTN_BROADCAST;
1740 RT_CACHE_STAT_INC(in_brd);
/* Local delivery: try the cached nexthop input route first. */
1746 rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1747 if (rt_cache_valid(rth)) {
1748 skb_dst_set_noref(skb, &rth->dst);
1756 rth = rt_dst_alloc(net->loopback_dev,
1757 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1761 rth->dst.input= ip_local_deliver;
1762 rth->dst.output= ip_rt_bug;
1763 #ifdef CONFIG_IP_ROUTE_CLASSID
1764 rth->dst.tclassid = itag;
1767 rth->rt_genid = rt_genid_ipv4(net);
1768 rth->rt_flags = flags|RTCF_LOCAL;
1769 rth->rt_type = res.type;
1770 rth->rt_is_input = 1;
1773 rth->rt_gateway = 0;
1774 rth->rt_uses_gateway = 0;
1775 INIT_LIST_HEAD(&rth->rt_uncached);
1776 RT_CACHE_STAT_INC(in_slow_tot);
/* No-route case: deliver to ip_error so ICMP unreachable is generated. */
1777 if (res.type == RTN_UNREACHABLE) {
1778 rth->dst.input= ip_error;
1779 rth->dst.error= -err;
1780 rth->rt_flags &= ~RTCF_LOCAL;
/* If caching on the nexthop fails, fall back to the uncached list. */
1783 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1784 rth->dst.flags |= DST_NOCACHE;
1785 rt_add_uncached_list(rth);
1788 skb_dst_set(skb, &rth->dst);
1793 RT_CACHE_STAT_INC(in_no_route);
1794 res.type = RTN_UNREACHABLE;
1800 * Do not cache martian addresses: they should be logged (RFC1812)
1802 martian_destination:
1803 RT_CACHE_STAT_INC(in_martian_dst);
1804 #ifdef CONFIG_IP_ROUTE_VERBOSE
1805 if (IN_DEV_LOG_MARTIANS(in_dev))
1806 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1807 &daddr, &saddr, dev->name);
1820 martian_source_keep_err:
1821 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
/* Public entry point for input route lookup (caller holds no dst ref).
 * Multicast destinations are recognized here (see comment below) and routed
 * via ip_route_input_mc(); everything else goes through
 * ip_route_input_slow().
 * NOTE(review): braces, the rcu_read_lock/unlock calls and several return
 * statements are missing from this extraction.
 */
1825 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1826 u8 tos, struct net_device *dev)
1832 /* Multicast recognition logic is moved from route cache to here.
1833 The problem was that too many Ethernet cards have broken/missing
1834 hardware multicast filters :-( As result the host on multicasting
1835 network acquires a lot of useless route cache entries, sort of
1836 SDR messages from all the world. Now we try to get rid of them.
1837 Really, provided software IP multicast filter is organized
1838 reasonably (at least, hashed), it does not result in a slowdown
1839 comparing with route cache reject entries.
1840 Note, that multicast routers are not affected, because
1841 route cache entry is created eventually.
1843 if (ipv4_is_multicast(daddr)) {
1844 struct in_device *in_dev = __in_dev_get_rcu(dev);
1847 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1848 ip_hdr(skb)->protocol);
1850 #ifdef CONFIG_IP_MROUTE
/* Even if the group is not ours, hand off to mc routing when the
 * device forwards multicast and daddr is not link-local. */
1852 (!ipv4_is_local_multicast(daddr) &&
1853 IN_DEV_MFORWARD(in_dev))
1856 int res = ip_route_input_mc(skb, daddr, saddr,
1865 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1869 EXPORT_SYMBOL(ip_route_input_noref);
1871 /* called with rcu_read_lock() */
/* Build an output route for the flow described by @fl4 towards @dev_out.
 * Classifies the destination (broadcast/multicast/unicast), consults the
 * per-nexthop output route cache and nexthop exceptions, and allocates a new
 * rtable when no valid cached entry exists.
 * NOTE(review): braces, some conditionals and return lines are missing from
 * this extraction; visible statements are unmodified.
 */
1872 static struct rtable *__mkroute_output(const struct fib_result *res,
1873 const struct flowi4 *fl4, int orig_oif,
1874 struct net_device *dev_out,
1877 struct fib_info *fi = res->fi;
1878 struct fib_nh_exception *fnhe;
1879 struct in_device *in_dev;
1880 u16 type = res->type;
1884 in_dev = __in_dev_get_rcu(dev_out);
1886 return ERR_PTR(-EINVAL);
/* Loopback source is only valid on a loopback device unless
 * route_localnet is enabled. */
1888 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1889 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1890 return ERR_PTR(-EINVAL);
1892 if (ipv4_is_lbcast(fl4->daddr))
1893 type = RTN_BROADCAST;
1894 else if (ipv4_is_multicast(fl4->daddr))
1895 type = RTN_MULTICAST;
1896 else if (ipv4_is_zeronet(fl4->daddr))
1897 return ERR_PTR(-EINVAL);
1899 if (dev_out->flags & IFF_LOOPBACK)
1900 flags |= RTCF_LOCAL;
1903 if (type == RTN_BROADCAST) {
1904 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1906 } else if (type == RTN_MULTICAST) {
1907 flags |= RTCF_MULTICAST | RTCF_LOCAL;
/* Drop RTCF_LOCAL again if we are not a member of the group. */
1908 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1910 flags &= ~RTCF_LOCAL;
1913 /* If multicast route do not exist use
1914 * default one, but do not gateway in this case.
1917 if (fi && res->prefixlen < 4)
/* Cache lookup: nexthop exception first, else the per-cpu output cache
 * (skipped for FLOWI_FLAG_KNOWN_NH link-scope flows). */
1922 do_cache &= fi != NULL;
1924 struct rtable __rcu **prth;
1925 struct fib_nh *nh = &FIB_RES_NH(*res);
1927 fnhe = find_exception(nh, fl4->daddr);
1929 prth = &fnhe->fnhe_rth_output;
1931 if (unlikely(fl4->flowi4_flags &
1932 FLOWI_FLAG_KNOWN_NH &&
1934 nh->nh_scope == RT_SCOPE_LINK))) {
1938 prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1940 rth = rcu_dereference(*prth);
1941 if (rt_cache_valid(rth)) {
1942 dst_hold(&rth->dst);
1948 rth = rt_dst_alloc(dev_out,
1949 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1950 IN_DEV_CONF_GET(in_dev, NOXFRM),
1953 return ERR_PTR(-ENOBUFS);
1955 rth->dst.output = ip_output;
1957 rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1958 rth->rt_flags = flags;
1959 rth->rt_type = type;
1960 rth->rt_is_input = 0;
1961 rth->rt_iif = orig_oif ? : 0;
1963 rth->rt_gateway = 0;
1964 rth->rt_uses_gateway = 0;
1965 INIT_LIST_HEAD(&rth->rt_uncached);
1967 RT_CACHE_STAT_INC(out_slow_tot);
1969 if (flags & RTCF_LOCAL)
1970 rth->dst.input = ip_local_deliver;
1971 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1972 if (flags & RTCF_LOCAL &&
1973 !(dev_out->flags & IFF_LOOPBACK)) {
/* Locally-originated bcast/mcast that also loops back to us. */
1974 rth->dst.output = ip_mc_output;
1975 RT_CACHE_STAT_INC(out_slow_mc);
1977 #ifdef CONFIG_IP_MROUTE
1978 if (type == RTN_MULTICAST) {
1979 if (IN_DEV_MFORWARD(in_dev) &&
1980 !ipv4_is_local_multicast(fl4->daddr)) {
1981 rth->dst.input = ip_mr_input;
1982 rth->dst.output = ip_mc_output;
1988 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1994 * Major route resolver routine.
/* Resolve an output route for @fl4: fix up the flow key (tos/scope),
 * special-case multicast/broadcast with a bound source address, select the
 * output device and source address, do the FIB lookup, then hand off to
 * __mkroute_output().
 * NOTE(review): labels ('make_route', 'out', ...), rcu locking and several
 * conditionals are missing from this extraction; visible statements are
 * unmodified.
 */
1997 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1999 struct net_device *dev_out = NULL;
2000 __u8 tos = RT_FL_TOS(fl4);
2001 unsigned int flags = 0;
2002 struct fib_result res;
2010 orig_oif = fl4->flowi4_oif;
2012 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2013 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
/* RTO_ONLINK in the tos forces link scope (destination assumed on-link). */
2014 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2015 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2019 rth = ERR_PTR(-EINVAL);
/* A specified source must be a valid unicast address. */
2020 if (ipv4_is_multicast(fl4->saddr) ||
2021 ipv4_is_lbcast(fl4->saddr) ||
2022 ipv4_is_zeronet(fl4->saddr))
2025 /* I removed check for oif == dev_out->oif here.
2026 It was wrong for two reasons:
2027 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2028 is assigned to multiple interfaces.
2029 2. Moreover, we are allowed to send packets with saddr
2030 of another iface. --ANK
2033 if (fl4->flowi4_oif == 0 &&
2034 (ipv4_is_multicast(fl4->daddr) ||
2035 ipv4_is_lbcast(fl4->daddr))) {
2036 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2037 dev_out = __ip_dev_find(net, fl4->saddr, false);
2038 if (dev_out == NULL)
2041 /* Special hack: user can direct multicasts
2042 and limited broadcast via necessary interface
2043 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2044 This hack is not just for fun, it allows
2045 vic,vat and friends to work.
2046 They bind socket to loopback, set ttl to zero
2047 and expect that it will work.
2048 From the viewpoint of routing cache they are broken,
2049 because we are not allowed to build multicast path
2050 with loopback source addr (look, routing cache
2051 cannot know, that ttl is zero, so that packet
2052 will not leave this host and route is valid).
2053 Luckily, this hack is good workaround.
2056 fl4->flowi4_oif = dev_out->ifindex;
2060 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2061 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2062 if (!__ip_dev_find(net, fl4->saddr, false))
/* An explicit output interface was requested. */
2068 if (fl4->flowi4_oif) {
2069 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2070 rth = ERR_PTR(-ENODEV);
2071 if (dev_out == NULL)
2074 /* RACE: Check return value of inet_select_addr instead. */
2075 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2076 rth = ERR_PTR(-ENETUNREACH);
2079 if (ipv4_is_local_multicast(fl4->daddr) ||
2080 ipv4_is_lbcast(fl4->daddr)) {
2082 fl4->saddr = inet_select_addr(dev_out, 0,
2087 if (ipv4_is_multicast(fl4->daddr))
2088 fl4->saddr = inet_select_addr(dev_out, 0,
2090 else if (!fl4->daddr)
2091 fl4->saddr = inet_select_addr(dev_out, 0,
/* No destination: loop back to ourselves. */
2097 fl4->daddr = fl4->saddr;
2099 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2100 dev_out = net->loopback_dev;
2101 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2102 res.type = RTN_LOCAL;
2103 flags |= RTCF_LOCAL;
2107 if (fib_lookup(net, fl4, &res)) {
2110 if (fl4->flowi4_oif) {
2111 /* Apparently, routing tables are wrong. Assume,
2112 that the destination is on link.
2115 Because we are allowed to send to iface
2116 even if it has NO routes and NO assigned
2117 addresses. When oif is specified, routing
2118 tables are looked up with only one purpose:
2119 to catch if destination is gatewayed, rather than
2120 direct. Moreover, if MSG_DONTROUTE is set,
2121 we send packet, ignoring both routing tables
2122 and ifaddr state. --ANK
2125 We could make it even if oif is unknown,
2126 likely IPv6, but we do not.
2129 if (fl4->saddr == 0)
2130 fl4->saddr = inet_select_addr(dev_out, 0,
2132 res.type = RTN_UNICAST;
2135 rth = ERR_PTR(-ENETUNREACH);
2139 if (res.type == RTN_LOCAL) {
2141 if (res.fi->fib_prefsrc)
2142 fl4->saddr = res.fi->fib_prefsrc;
2144 fl4->saddr = fl4->daddr;
2146 dev_out = net->loopback_dev;
2147 fl4->flowi4_oif = dev_out->ifindex;
2148 flags |= RTCF_LOCAL;
2152 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2153 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2154 fib_select_multipath(&res);
/* Several default routes of equal preference: pick one. */
2157 if (!res.prefixlen &&
2158 res.table->tb_num_default > 1 &&
2159 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2160 fib_select_default(&res);
2163 fl4->saddr = FIB_RES_PREFSRC(net, res);
2165 dev_out = FIB_RES_DEV(res);
2166 fl4->flowi4_oif = dev_out->ifindex;
2170 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2176 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2178 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
/* dst_ops.mtu for blackhole routes: the raw RTAX_MTU metric, falling back to
 * the device MTU when the metric is zero (GNU '?:' elvis operator).
 * NOTE(review): braces are missing from this extraction.
 */
2183 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2185 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2187 return mtu ? : dst->dev->mtu;
/* dst_ops.update_pmtu for blackhole routes: intentionally a no-op —
 * blackhole dsts ignore PMTU updates. */
2190 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2191 struct sk_buff *skb, u32 mtu)
/* dst_ops.redirect for blackhole routes: intentionally a no-op —
 * blackhole dsts ignore ICMP redirects. */
2195 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2196 struct sk_buff *skb)
2200 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
/* dst_ops vtable for blackhole routes: PMTU/redirect callbacks are no-ops,
 * the other hooks are shared with the regular IPv4 dst_ops
 * (ipv4_default_advmss, ipv4_neigh_lookup).
 * NOTE(review): the '.family' initializer line and closing brace are missing
 * from this extraction.
 */
2206 static struct dst_ops ipv4_dst_blackhole_ops = {
2208 .protocol = cpu_to_be16(ETH_P_IP),
2209 .check = ipv4_blackhole_dst_check,
2210 .mtu = ipv4_blackhole_mtu,
2211 .default_advmss = ipv4_default_advmss,
2212 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2213 .redirect = ipv4_rt_blackhole_redirect,
2214 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2215 .neigh_lookup = ipv4_neigh_lookup,
/* Clone @dst_orig into a blackhole dst (input/output both dst_discard) that
 * keeps the original's routing attributes; used when a real route must be
 * replaced by one that silently drops packets.  Releases the reference on
 * @dst_orig and returns the new dst, or ERR_PTR(-ENOMEM) if allocation
 * failed.
 * NOTE(review): some field copies and braces are missing from this
 * extraction.
 */
2218 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2220 struct rtable *ort = (struct rtable *) dst_orig;
2223 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2225 struct dst_entry *new = &rt->dst;
/* Both directions discard: this is the blackhole behaviour. */
2228 new->input = dst_discard;
2229 new->output = dst_discard;
2231 new->dev = ort->dst.dev;
/* Copy the routing attributes of the original route. */
2235 rt->rt_is_input = ort->rt_is_input;
2236 rt->rt_iif = ort->rt_iif;
2237 rt->rt_pmtu = ort->rt_pmtu;
2239 rt->rt_genid = rt_genid_ipv4(net);
2240 rt->rt_flags = ort->rt_flags;
2241 rt->rt_type = ort->rt_type;
2242 rt->rt_gateway = ort->rt_gateway;
2243 rt->rt_uses_gateway = ort->rt_uses_gateway;
2245 INIT_LIST_HEAD(&rt->rt_uncached);
2250 dst_release(dst_orig);
2252 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
/* Resolve an output route and, when the flow has a protocol set, pass it
 * through xfrm_lookup() so IPsec policy can transform or replace the dst.
 * NOTE(review): braces, the error-path check on rth and the final return are
 * missing from this extraction.
 */
2255 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2258 struct rtable *rt = __ip_route_output_key(net, flp4);
2263 if (flp4->flowi4_proto)
2264 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2265 flowi4_to_flowi(flp4),
2270 EXPORT_SYMBOL_GPL(ip_route_output_flow);
/* Serialize the route attached to @skb into an RTM_NEWROUTE netlink message
 * (rtmsg header plus RTA_* attributes: table, dst, src, oif, classid,
 * prefsrc, gateway, metrics, mark, iif, cacheinfo).
 * Returns the nlmsg_end() result on success; on overflow the message is
 * cancelled via nlmsg_cancel() (return line not visible in this extraction).
 * NOTE(review): braces and several attribute lines are missing from this
 * extraction; visible statements are unmodified.
 */
2272 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2273 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2274 u32 seq, int event, int nowait, unsigned int flags)
2276 struct rtable *rt = skb_rtable(skb);
2278 struct nlmsghdr *nlh;
2279 unsigned long expires = 0;
2281 u32 metrics[RTAX_MAX];
2283 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2287 r = nlmsg_data(nlh);
2288 r->rtm_family = AF_INET;
2289 r->rtm_dst_len = 32;
2291 r->rtm_tos = fl4->flowi4_tos;
2292 r->rtm_table = RT_TABLE_MAIN;
2293 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2294 goto nla_put_failure;
2295 r->rtm_type = rt->rt_type;
2296 r->rtm_scope = RT_SCOPE_UNIVERSE;
2297 r->rtm_protocol = RTPROT_UNSPEC;
2298 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2299 if (rt->rt_flags & RTCF_NOTIFY)
2300 r->rtm_flags |= RTM_F_NOTIFY;
2302 if (nla_put_be32(skb, RTA_DST, dst))
2303 goto nla_put_failure;
2305 r->rtm_src_len = 32;
2306 if (nla_put_be32(skb, RTA_SRC, src))
2307 goto nla_put_failure;
2310 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2311 goto nla_put_failure;
2312 #ifdef CONFIG_IP_ROUTE_CLASSID
2313 if (rt->dst.tclassid &&
2314 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2315 goto nla_put_failure;
2317 if (!rt_is_input_route(rt) &&
2318 fl4->saddr != src) {
2319 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2320 goto nla_put_failure;
2322 if (rt->rt_uses_gateway &&
2323 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2324 goto nla_put_failure;
/* Convert the absolute expiry into a relative interval for userspace. */
2326 expires = rt->dst.expires;
2328 unsigned long now = jiffies;
2330 if (time_before(now, expires))
2336 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
/* A pending PMTU value overrides the stored MTU metric while it lasts. */
2337 if (rt->rt_pmtu && expires)
2338 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2339 if (rtnetlink_put_metrics(skb, metrics) < 0)
2340 goto nla_put_failure;
2342 if (fl4->flowi4_mark &&
2343 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2344 goto nla_put_failure;
2346 error = rt->dst.error;
2348 if (rt_is_input_route(rt)) {
2349 #ifdef CONFIG_IP_MROUTE
/* Forwarded multicast: ask the mroute code for the real input route. */
2350 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2351 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2352 int err = ipmr_get_route(net, skb,
2353 fl4->saddr, fl4->daddr,
2359 goto nla_put_failure;
2361 if (err == -EMSGSIZE)
2362 goto nla_put_failure;
2368 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2369 goto nla_put_failure;
2372 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2373 goto nla_put_failure;
2375 return nlmsg_end(skb, nlh);
2378 nlmsg_cancel(skb, nlh);
/* RTM_GETROUTE handler: parse the request attributes, build a dummy skb with
 * a minimal IP header, resolve the route (input path if RTA_IIF was given,
 * output path otherwise), then reply with rt_fill_info() via rtnl_unicast().
 * NOTE(review): error labels, frees and several checks are missing from this
 * extraction; visible statements are unmodified.
 */
2382 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2384 struct net *net = sock_net(in_skb->sk);
2386 struct nlattr *tb[RTA_MAX+1];
2387 struct rtable *rt = NULL;
2394 struct sk_buff *skb;
2396 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2400 rtm = nlmsg_data(nlh);
2402 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2408 /* Reserve room for dummy headers, this skb can pass
2409 through good chunk of routing engine.
2411 skb_reset_mac_header(skb);
2412 skb_reset_network_header(skb);
2414 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2415 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2416 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2418 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2419 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2420 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2421 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2423 memset(&fl4, 0, sizeof(fl4));
2426 fl4.flowi4_tos = rtm->rtm_tos;
2427 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2428 fl4.flowi4_mark = mark;
/* Input-route query: simulate reception of the packet on @iif. */
2431 struct net_device *dev;
2433 dev = __dev_get_by_index(net, iif);
2439 skb->protocol = htons(ETH_P_IP);
2443 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2446 rt = skb_rtable(skb);
2447 if (err == 0 && rt->dst.error)
2448 err = -rt->dst.error;
/* Output-route query. */
2450 rt = ip_route_output_key(net, &fl4);
2460 skb_dst_set(skb, &rt->dst);
2461 if (rtm->rtm_flags & RTM_F_NOTIFY)
2462 rt->rt_flags |= RTCF_NOTIFY;
2464 err = rt_fill_info(net, dst, src, &fl4, skb,
2465 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2466 RTM_NEWROUTE, 0, 0);
2470 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2479 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
/* Multicast configuration changed on @in_dev: flush this netns's cached
 * routes so stale multicast decisions are dropped.
 * NOTE(review): braces are missing from this extraction.
 */
2484 void ip_rt_multicast_event(struct in_device *in_dev)
2486 rt_cache_flush(dev_net(in_dev->dev));
2489 #ifdef CONFIG_SYSCTL
/* Garbage-collection tunables exposed below via ipv4_route_table
 * ("gc_timeout", "gc_interval", "gc_min_interval[_ms]", "gc_elasticity"). */
2490 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
2491 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2492 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2493 static int ip_rt_gc_elasticity __read_mostly = 8;
/* Handler for the "flush" sysctl: on write, flush this netns's route cache
 * and bump the fnhe genid so cached nexthop exceptions are invalidated too.
 * The owning struct net is smuggled in through ctl_table.extra1 (set up in
 * sysctl_route_net_init()).
 * NOTE(review): the 'if (write)' line, return statements and braces are
 * missing from this extraction.
 */
2495 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2496 void __user *buffer,
2497 size_t *lenp, loff_t *ppos)
2499 struct net *net = (struct net *)__ctl->extra1;
2502 rt_cache_flush(net);
2503 fnhe_genid_bump(net);
/* /proc/sys/net/ipv4/route/* — global (init_net) routing sysctls.
 * NOTE(review): the '.mode' initializers, entry-closing braces and the
 * terminating empty entry are missing from this extraction.
 */
2510 static struct ctl_table ipv4_route_table[] = {
2512 .procname = "gc_thresh",
2513 .data = &ipv4_dst_ops.gc_thresh,
2514 .maxlen = sizeof(int),
2516 .proc_handler = proc_dointvec,
2519 .procname = "max_size",
2520 .data = &ip_rt_max_size,
2521 .maxlen = sizeof(int),
2523 .proc_handler = proc_dointvec,
2526 /* Deprecated. Use gc_min_interval_ms */
2528 .procname = "gc_min_interval",
2529 .data = &ip_rt_gc_min_interval,
2530 .maxlen = sizeof(int),
2532 .proc_handler = proc_dointvec_jiffies,
2535 .procname = "gc_min_interval_ms",
2536 .data = &ip_rt_gc_min_interval,
2537 .maxlen = sizeof(int),
2539 .proc_handler = proc_dointvec_ms_jiffies,
2542 .procname = "gc_timeout",
2543 .data = &ip_rt_gc_timeout,
2544 .maxlen = sizeof(int),
2546 .proc_handler = proc_dointvec_jiffies,
2549 .procname = "gc_interval",
2550 .data = &ip_rt_gc_interval,
2551 .maxlen = sizeof(int),
2553 .proc_handler = proc_dointvec_jiffies,
2556 .procname = "redirect_load",
2557 .data = &ip_rt_redirect_load,
2558 .maxlen = sizeof(int),
2560 .proc_handler = proc_dointvec,
2563 .procname = "redirect_number",
2564 .data = &ip_rt_redirect_number,
2565 .maxlen = sizeof(int),
2567 .proc_handler = proc_dointvec,
2570 .procname = "redirect_silence",
2571 .data = &ip_rt_redirect_silence,
2572 .maxlen = sizeof(int),
2574 .proc_handler = proc_dointvec,
2577 .procname = "error_cost",
2578 .data = &ip_rt_error_cost,
2579 .maxlen = sizeof(int),
2581 .proc_handler = proc_dointvec,
2584 .procname = "error_burst",
2585 .data = &ip_rt_error_burst,
2586 .maxlen = sizeof(int),
2588 .proc_handler = proc_dointvec,
2591 .procname = "gc_elasticity",
2592 .data = &ip_rt_gc_elasticity,
2593 .maxlen = sizeof(int),
2595 .proc_handler = proc_dointvec,
2598 .procname = "mtu_expires",
2599 .data = &ip_rt_mtu_expires,
2600 .maxlen = sizeof(int),
2602 .proc_handler = proc_dointvec_jiffies,
2605 .procname = "min_pmtu",
2606 .data = &ip_rt_min_pmtu,
2607 .maxlen = sizeof(int),
2609 .proc_handler = proc_dointvec,
2612 .procname = "min_adv_mss",
2613 .data = &ip_rt_min_advmss,
2614 .maxlen = sizeof(int),
2616 .proc_handler = proc_dointvec,
/* Per-netns "flush" sysctl; the template is kmemdup'd for non-init netns in
 * sysctl_route_net_init() so extra1 can point at the owning struct net.
 * NOTE(review): the '.mode' initializer and terminating empty entry are
 * missing from this extraction.
 */
2621 static struct ctl_table ipv4_route_flush_table[] = {
2623 .procname = "flush",
2624 .maxlen = sizeof(int),
2626 .proc_handler = ipv4_sysctl_rtcache_flush,
/* Per-netns init: register net/ipv4/route sysctls.  For non-init netns the
 * flush table is duplicated so its extra1 can carry the netns pointer, and
 * the entry is hidden from unprivileged user namespaces by clearing its
 * procname.
 * NOTE(review): error labels, frees and return statements are missing from
 * this extraction.
 */
2631 static __net_init int sysctl_route_net_init(struct net *net)
2633 struct ctl_table *tbl;
2635 tbl = ipv4_route_flush_table;
2636 if (!net_eq(net, &init_net)) {
2637 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2641 /* Don't export sysctls to unprivileged users */
2642 if (net->user_ns != &init_user_ns)
2643 tbl[0].procname = NULL;
2645 tbl[0].extra1 = net;
2647 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2648 if (net->ipv4.route_hdr == NULL)
2653 if (tbl != ipv4_route_flush_table)
/* Per-netns exit: unregister the route sysctls and free the duplicated
 * table.  The BUG_ON asserts a non-init netns never ended up registered with
 * the shared template (its table must be the kmemdup'd copy).
 * NOTE(review): the trailing kfree and braces are missing from this
 * extraction.
 */
2659 static __net_exit void sysctl_route_net_exit(struct net *net)
2661 struct ctl_table *tbl;
2663 tbl = net->ipv4.route_hdr->ctl_table_arg;
2664 unregister_net_sysctl_table(net->ipv4.route_hdr);
2665 BUG_ON(tbl == ipv4_route_flush_table);
/* pernet_operations wiring the sysctl init/exit handlers above. */
2669 static __net_initdata struct pernet_operations sysctl_route_ops = {
2670 .init = sysctl_route_net_init,
2671 .exit = sysctl_route_net_exit,
/* Per-netns init: zero the route and fnhe generation counters and randomize
 * dev_addr_genid (used in source-address-derived route decisions).
 * NOTE(review): the 'return 0;' and braces are missing from this
 * extraction.
 */
2675 static __net_init int rt_genid_init(struct net *net)
2677 atomic_set(&net->ipv4.rt_genid, 0);
2678 atomic_set(&net->fnhe_genid, 0);
2679 get_random_bytes(&net->ipv4.dev_addr_genid,
2680 sizeof(net->ipv4.dev_addr_genid));
/* pernet_operations for genid initialization (init only; no exit needed). */
2684 static __net_initdata struct pernet_operations rt_genid_ops = {
2685 .init = rt_genid_init,
/* Per-netns init: allocate and initialize the IPv4 inet_peer base.
 * NOTE(review): the allocation-failure check and 'return 0;' are missing
 * from this extraction.
 */
2688 static int __net_init ipv4_inetpeer_init(struct net *net)
2690 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2694 inet_peer_base_init(bp);
2695 net->ipv4.peers = bp;
/* Per-netns exit: detach the inet_peer base and invalidate its tree.
 * NOTE(review): the trailing kfree(bp) and braces are missing from this
 * extraction — TODO confirm against the full source.
 */
2699 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2701 struct inet_peer_base *bp = net->ipv4.peers;
2703 net->ipv4.peers = NULL;
2704 inetpeer_invalidate_tree(bp);
/* pernet_operations wiring the inetpeer init/exit handlers above. */
2708 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2709 .init = ipv4_inetpeer_init,
2710 .exit = ipv4_inetpeer_exit,
/* Per-cpu route classid accounting area, allocated in ip_rt_init(). */
2713 #ifdef CONFIG_IP_ROUTE_CLASSID
2714 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2715 #endif /* CONFIG_IP_ROUTE_CLASSID */
/* Boot-time initialization of the IPv4 routing subsystem: percpu classid
 * accounting, the rtable slab cache (shared with the blackhole dst_ops),
 * dst entry counters, /proc files, the RTM_GETROUTE handler and the pernet
 * subsystems registered above.  Failures in the mandatory allocations
 * panic(), as nothing can work without them.
 * NOTE(review): braces, the xfrm init calls and the final return are missing
 * from this extraction.
 */
2717 int __init ip_rt_init(void)
2721 #ifdef CONFIG_IP_ROUTE_CLASSID
2722 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2724 panic("IP: failed to allocate ip_rt_acct\n");
2727 ipv4_dst_ops.kmem_cachep =
2728 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2729 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2731 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2733 if (dst_entries_init(&ipv4_dst_ops) < 0)
2734 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2736 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2737 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
/* Effectively disable dst GC pressure: unlimited threshold and size. */
2739 ipv4_dst_ops.gc_thresh = ~0;
2740 ip_rt_max_size = INT_MAX;
2745 if (ip_rt_proc_init())
2746 pr_err("Unable to create route proc files\n");
2751 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2753 #ifdef CONFIG_SYSCTL
2754 register_pernet_subsys(&sysctl_route_ops);
2756 register_pernet_subsys(&rt_genid_ops);
2757 register_pernet_subsys(&ipv4_inetpeer_ops);
2761 #ifdef CONFIG_SYSCTL
2763 * We really need to sanitize the damn ipv4 init order, then all
2764 * this nonsense will go away.
2766 void __init ip_static_sysctl_init(void)
2768 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);