1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 #include <net/secure_seq.h>
112
113 #define RT_FL_TOS(oldflp4) \
114     ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
123 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
124 static int ip_rt_redirect_number __read_mostly  = 9;
125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly       = HZ;
128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
129 static int ip_rt_gc_elasticity __read_mostly    = 8;
130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly       = 256;
133 static int rt_chain_length_max __read_mostly    = 20;
134
135 /*
136  *      Interface to generic destination cache.
137  */
138
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
141 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
142 static void              ipv4_dst_destroy(struct dst_entry *dst);
143 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144 static void              ipv4_link_failure(struct sk_buff *skb);
145 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
146 static int rt_garbage_collect(struct dst_ops *ops);
147
148 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
149                             int how)
150 {
151 }
152
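/*
 * Copy-on-write of dst metrics: bind an inet_peer to the route if one is
 * not attached yet, copy the current (read-only) metrics into the peer's
 * private array, and switch dst->_metrics over to it with cmpxchg().
 */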
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155         struct rtable *rt = (struct rtable *) dst;
156         struct inet_peer *peer;
157         u32 *p = NULL;
158
159         if (!rt->peer)
160                 rt_bind_peer(rt, rt->rt_dst, 1);
161
162         peer = rt->peer;
163         if (peer) {
164                 u32 *old_p = __DST_METRICS_PTR(old);
165                 unsigned long prev, new;
166
167                 p = peer->metrics;
168                 if (inet_metrics_new(peer))
169                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
170
171                 new = (unsigned long) p;
172                 prev = cmpxchg(&dst->_metrics, old, new);
173
174                 if (prev != old) {
175                         p = __DST_METRICS_PTR(prev);
176                         if (prev & DST_METRICS_READ_ONLY)
177                                 p = NULL;
178                 } else {
179                         if (rt->fi) {
180                                 fib_info_put(rt->fi);
181                                 rt->fi = NULL;
182                         }
183                 }
184         }
185         return p;
186 }
187
188 static struct dst_ops ipv4_dst_ops = {
189         .family =               AF_INET,
190         .protocol =             cpu_to_be16(ETH_P_IP),
191         .gc =                   rt_garbage_collect,
192         .check =                ipv4_dst_check,
193         .default_advmss =       ipv4_default_advmss,
194         .default_mtu =          ipv4_default_mtu,
195         .cow_metrics =          ipv4_cow_metrics,
196         .destroy =              ipv4_dst_destroy,
197         .ifdown =               ipv4_dst_ifdown,
198         .negative_advice =      ipv4_negative_advice,
199         .link_failure =         ipv4_link_failure,
200         .update_pmtu =          ip_rt_update_pmtu,
201         .local_out =            __ip_local_out,
202 };
203
204 #define ECN_OR_COST(class)      TC_PRIO_##class
205
206 const __u8 ip_tos2prio[16] = {
207         TC_PRIO_BESTEFFORT,
208         ECN_OR_COST(BESTEFFORT),
209         TC_PRIO_BESTEFFORT,
210         ECN_OR_COST(BESTEFFORT),
211         TC_PRIO_BULK,
212         ECN_OR_COST(BULK),
213         TC_PRIO_BULK,
214         ECN_OR_COST(BULK),
215         TC_PRIO_INTERACTIVE,
216         ECN_OR_COST(INTERACTIVE),
217         TC_PRIO_INTERACTIVE,
218         ECN_OR_COST(INTERACTIVE),
219         TC_PRIO_INTERACTIVE_BULK,
220         ECN_OR_COST(INTERACTIVE_BULK),
221         TC_PRIO_INTERACTIVE_BULK,
222         ECN_OR_COST(INTERACTIVE_BULK)
223 };
224
225
226 /*
227  * Route cache.
228  */
229
230 /* The locking scheme is rather straightforward:
231  *
232  * 1) Read-Copy Update protects the buckets of the central route hash.
233  * 2) Only writers remove entries, and they hold the lock
234  *    as they look at rtable reference counts.
235  * 3) Only readers acquire references to rtable entries,
236  *    they do so with atomic increments and with the
237  *    lock held.
238  */
239
240 struct rt_hash_bucket {
241         struct rtable __rcu     *chain;
242 };
243
244 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
245         defined(CONFIG_PROVE_LOCKING)
246 /*
247  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
248  * The size of this table is a power of two and depends on the number of CPUs
249  * (with lockdep, spinlock_t is quite big, so keep the size down there).
250  */
251 #ifdef CONFIG_LOCKDEP
252 # define RT_HASH_LOCK_SZ        256
253 #else
254 # if NR_CPUS >= 32
255 #  define RT_HASH_LOCK_SZ       4096
256 # elif NR_CPUS >= 16
257 #  define RT_HASH_LOCK_SZ       2048
258 # elif NR_CPUS >= 8
259 #  define RT_HASH_LOCK_SZ       1024
260 # elif NR_CPUS >= 4
261 #  define RT_HASH_LOCK_SZ       512
262 # else
263 #  define RT_HASH_LOCK_SZ       256
264 # endif
265 #endif
266
267 static spinlock_t       *rt_hash_locks;
268 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
269
270 static __init void rt_hash_lock_init(void)
271 {
272         int i;
273
274         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
275                         GFP_KERNEL);
276         if (!rt_hash_locks)
277                 panic("IP: failed to allocate rt_hash_locks\n");
278
279         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
280                 spin_lock_init(&rt_hash_locks[i]);
281 }
282 #else
283 # define rt_hash_lock_addr(slot) NULL
284
285 static inline void rt_hash_lock_init(void)
286 {
287 }
288 #endif
289
290 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
291 static unsigned                 rt_hash_mask __read_mostly;
292 static unsigned int             rt_hash_log  __read_mostly;
293
294 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
295 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
296
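/*
 * Hash a flow (daddr, saddr and the in/out interface index) into a bucket
 * index.  The cache generation id is mixed in, so lookups made after a
 * flush do not land on entries hashed under the old generation.
 */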
297 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
298                                    int genid)
299 {
300         return jhash_3words((__force u32)daddr, (__force u32)saddr,
301                             idx, genid)
302                 & rt_hash_mask;
303 }
304
305 static inline int rt_genid(struct net *net)
306 {
307         return atomic_read(&net->ipv4.rt_genid);
308 }
309
310 #ifdef CONFIG_PROC_FS
311 struct rt_cache_iter_state {
312         struct seq_net_private p;
313         int bucket;
314         int genid;
315 };
316
317 static struct rtable *rt_cache_get_first(struct seq_file *seq)
318 {
319         struct rt_cache_iter_state *st = seq->private;
320         struct rtable *r = NULL;
321
322         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
323                 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
324                         continue;
325                 rcu_read_lock_bh();
326                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
327                 while (r) {
328                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
329                             r->rt_genid == st->genid)
330                                 return r;
331                         r = rcu_dereference_bh(r->dst.rt_next);
332                 }
333                 rcu_read_unlock_bh();
334         }
335         return r;
336 }
337
338 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
339                                           struct rtable *r)
340 {
341         struct rt_cache_iter_state *st = seq->private;
342
343         r = rcu_dereference_bh(r->dst.rt_next);
344         while (!r) {
345                 rcu_read_unlock_bh();
346                 do {
347                         if (--st->bucket < 0)
348                                 return NULL;
349                 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
350                 rcu_read_lock_bh();
351                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
352         }
353         return r;
354 }
355
356 static struct rtable *rt_cache_get_next(struct seq_file *seq,
357                                         struct rtable *r)
358 {
359         struct rt_cache_iter_state *st = seq->private;
360         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
361                 if (dev_net(r->dst.dev) != seq_file_net(seq))
362                         continue;
363                 if (r->rt_genid == st->genid)
364                         break;
365         }
366         return r;
367 }
368
369 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
370 {
371         struct rtable *r = rt_cache_get_first(seq);
372
373         if (r)
374                 while (pos && (r = rt_cache_get_next(seq, r)))
375                         --pos;
376         return pos ? NULL : r;
377 }
378
379 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
380 {
381         struct rt_cache_iter_state *st = seq->private;
382         if (*pos)
383                 return rt_cache_get_idx(seq, *pos - 1);
384         st->genid = rt_genid(seq_file_net(seq));
385         return SEQ_START_TOKEN;
386 }
387
388 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
389 {
390         struct rtable *r;
391
392         if (v == SEQ_START_TOKEN)
393                 r = rt_cache_get_first(seq);
394         else
395                 r = rt_cache_get_next(seq, v);
396         ++*pos;
397         return r;
398 }
399
400 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
401 {
402         if (v && v != SEQ_START_TOKEN)
403                 rcu_read_unlock_bh();
404 }
405
406 static int rt_cache_seq_show(struct seq_file *seq, void *v)
407 {
408         if (v == SEQ_START_TOKEN)
409                 seq_printf(seq, "%-127s\n",
410                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
411                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
412                            "HHUptod\tSpecDst");
413         else {
414                 struct rtable *r = v;
415                 int len;
416
417                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
418                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
419                         r->dst.dev ? r->dst.dev->name : "*",
420                         (__force u32)r->rt_dst,
421                         (__force u32)r->rt_gateway,
422                         r->rt_flags, atomic_read(&r->dst.__refcnt),
423                         r->dst.__use, 0, (__force u32)r->rt_src,
424                         dst_metric_advmss(&r->dst) + 40,
425                         dst_metric(&r->dst, RTAX_WINDOW),
426                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
427                               dst_metric(&r->dst, RTAX_RTTVAR)),
428                         r->rt_key_tos,
429                         r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
430                         r->dst.hh ? (r->dst.hh->hh_output ==
431                                        dev_queue_xmit) : 0,
432                         r->rt_spec_dst, &len);
433
434                 seq_printf(seq, "%*s\n", 127 - len, "");
435         }
436         return 0;
437 }
438
439 static const struct seq_operations rt_cache_seq_ops = {
440         .start  = rt_cache_seq_start,
441         .next   = rt_cache_seq_next,
442         .stop   = rt_cache_seq_stop,
443         .show   = rt_cache_seq_show,
444 };
445
446 static int rt_cache_seq_open(struct inode *inode, struct file *file)
447 {
448         return seq_open_net(inode, file, &rt_cache_seq_ops,
449                         sizeof(struct rt_cache_iter_state));
450 }
451
452 static const struct file_operations rt_cache_seq_fops = {
453         .owner   = THIS_MODULE,
454         .open    = rt_cache_seq_open,
455         .read    = seq_read,
456         .llseek  = seq_lseek,
457         .release = seq_release_net,
458 };
459
460
461 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
462 {
463         int cpu;
464
465         if (*pos == 0)
466                 return SEQ_START_TOKEN;
467
468         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
469                 if (!cpu_possible(cpu))
470                         continue;
471                 *pos = cpu+1;
472                 return &per_cpu(rt_cache_stat, cpu);
473         }
474         return NULL;
475 }
476
477 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
478 {
479         int cpu;
480
481         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
482                 if (!cpu_possible(cpu))
483                         continue;
484                 *pos = cpu+1;
485                 return &per_cpu(rt_cache_stat, cpu);
486         }
487         return NULL;
488
489 }
490
491 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
492 {
493
494 }
495
496 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
497 {
498         struct rt_cache_stat *st = v;
499
500         if (v == SEQ_START_TOKEN) {
501                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
502                 return 0;
503         }
504
505         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
506                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
507                    dst_entries_get_slow(&ipv4_dst_ops),
508                    st->in_hit,
509                    st->in_slow_tot,
510                    st->in_slow_mc,
511                    st->in_no_route,
512                    st->in_brd,
513                    st->in_martian_dst,
514                    st->in_martian_src,
515
516                    st->out_hit,
517                    st->out_slow_tot,
518                    st->out_slow_mc,
519
520                    st->gc_total,
521                    st->gc_ignored,
522                    st->gc_goal_miss,
523                    st->gc_dst_overflow,
524                    st->in_hlist_search,
525                    st->out_hlist_search
526                 );
527         return 0;
528 }
529
530 static const struct seq_operations rt_cpu_seq_ops = {
531         .start  = rt_cpu_seq_start,
532         .next   = rt_cpu_seq_next,
533         .stop   = rt_cpu_seq_stop,
534         .show   = rt_cpu_seq_show,
535 };
536
537
538 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
539 {
540         return seq_open(file, &rt_cpu_seq_ops);
541 }
542
543 static const struct file_operations rt_cpu_seq_fops = {
544         .owner   = THIS_MODULE,
545         .open    = rt_cpu_seq_open,
546         .read    = seq_read,
547         .llseek  = seq_lseek,
548         .release = seq_release,
549 };
550
551 #ifdef CONFIG_IP_ROUTE_CLASSID
552 static int rt_acct_proc_show(struct seq_file *m, void *v)
553 {
554         struct ip_rt_acct *dst, *src;
555         unsigned int i, j;
556
557         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
558         if (!dst)
559                 return -ENOMEM;
560
561         for_each_possible_cpu(i) {
562                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
563                 for (j = 0; j < 256; j++) {
564                         dst[j].o_bytes   += src[j].o_bytes;
565                         dst[j].o_packets += src[j].o_packets;
566                         dst[j].i_bytes   += src[j].i_bytes;
567                         dst[j].i_packets += src[j].i_packets;
568                 }
569         }
570
571         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
572         kfree(dst);
573         return 0;
574 }
575
576 static int rt_acct_proc_open(struct inode *inode, struct file *file)
577 {
578         return single_open(file, rt_acct_proc_show, NULL);
579 }
580
581 static const struct file_operations rt_acct_proc_fops = {
582         .owner          = THIS_MODULE,
583         .open           = rt_acct_proc_open,
584         .read           = seq_read,
585         .llseek         = seq_lseek,
586         .release        = single_release,
587 };
588 #endif
589
590 static int __net_init ip_rt_do_proc_init(struct net *net)
591 {
592         struct proc_dir_entry *pde;
593
594         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
595                         &rt_cache_seq_fops);
596         if (!pde)
597                 goto err1;
598
599         pde = proc_create("rt_cache", S_IRUGO,
600                           net->proc_net_stat, &rt_cpu_seq_fops);
601         if (!pde)
602                 goto err2;
603
604 #ifdef CONFIG_IP_ROUTE_CLASSID
605         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
606         if (!pde)
607                 goto err3;
608 #endif
609         return 0;
610
611 #ifdef CONFIG_IP_ROUTE_CLASSID
612 err3:
613         remove_proc_entry("rt_cache", net->proc_net_stat);
614 #endif
615 err2:
616         remove_proc_entry("rt_cache", net->proc_net);
617 err1:
618         return -ENOMEM;
619 }
620
621 static void __net_exit ip_rt_do_proc_exit(struct net *net)
622 {
623         remove_proc_entry("rt_cache", net->proc_net_stat);
624         remove_proc_entry("rt_cache", net->proc_net);
625 #ifdef CONFIG_IP_ROUTE_CLASSID
626         remove_proc_entry("rt_acct", net->proc_net);
627 #endif
628 }
629
630 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
631         .init = ip_rt_do_proc_init,
632         .exit = ip_rt_do_proc_exit,
633 };
634
635 static int __init ip_rt_proc_init(void)
636 {
637         return register_pernet_subsys(&ip_rt_proc_ops);
638 }
639
640 #else
641 static inline int ip_rt_proc_init(void)
642 {
643         return 0;
644 }
645 #endif /* CONFIG_PROC_FS */
646
647 static inline void rt_free(struct rtable *rt)
648 {
649         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
650 }
651
652 static inline void rt_drop(struct rtable *rt)
653 {
654         ip_rt_put(rt);
655         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
656 }
657
658 static inline int rt_fast_clean(struct rtable *rth)
659 {
660         /* Kill broadcast/multicast entries very aggressively, if they
661            collide in the hash table with more useful entries */
662         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
663                 rt_is_input_route(rth) && rth->dst.rt_next;
664 }
665
666 static inline int rt_valuable(struct rtable *rth)
667 {
668         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
669                 (rth->peer && rth->peer->pmtu_expires);
670 }
671
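/*
 * Decide whether an unreferenced cache entry may be evicted: tmo1 is the
 * grace period for ordinary entries, tmo2 the one for "valuable" ones;
 * entries matched by rt_fast_clean() only get the tmo2 grace if valuable.
 */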
672 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
673 {
674         unsigned long age;
675         int ret = 0;
676
677         if (atomic_read(&rth->dst.__refcnt))
678                 goto out;
679
680         age = jiffies - rth->dst.lastuse;
681         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
682             (age <= tmo2 && rt_valuable(rth)))
683                 goto out;
684         ret = 1;
685 out:    return ret;
686 }
687
688 /* Bits of score are:
689  * 31: very valuable
690  * 30: not quite useless
691  * 29..0: usage counter
692  */
693 static inline u32 rt_score(struct rtable *rt)
694 {
695         u32 score = jiffies - rt->dst.lastuse;
696
697         score = ~score & ~(3<<30);
698
699         if (rt_valuable(rt))
700                 score |= (1<<31);
701
702         if (rt_is_output_route(rt) ||
703             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
704                 score |= (1<<30);
705
706         return score;
707 }
708
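/* Route caching stays enabled until the emergency-rebuild count exceeds the sysctl limit. */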
709 static inline bool rt_caching(const struct net *net)
710 {
711         return net->ipv4.current_rt_cache_rebuild_count <=
712                 net->ipv4.sysctl_rt_cache_rebuild_count;
713 }
714
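/*
 * compare_hash_inputs() matches only destination, source and input
 * interface (enough to spot aliases sharing the same hash inputs);
 * compare_keys() below additionally matches mark, tos and oif for an
 * exact cache hit.
 */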
715 static inline bool compare_hash_inputs(const struct rtable *rt1,
716                                        const struct rtable *rt2)
717 {
718         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
719                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
720                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
721 }
722
723 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
724 {
725         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
726                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
727                 (rt1->rt_mark ^ rt2->rt_mark) |
728                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
729                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
730                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
731 }
732
733 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
734 {
735         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
736 }
737
738 static inline int rt_is_expired(struct rtable *rth)
739 {
740         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
741 }
742
743 /*
744  * Perform a full scan of the hash table and free all entries.
745  * Can be called from a softirq or from process context.
746  * In the latter case, we reschedule if necessary.
747  */
748 static void rt_do_flush(struct net *net, int process_context)
749 {
750         unsigned int i;
751         struct rtable *rth, *next;
752
753         for (i = 0; i <= rt_hash_mask; i++) {
754                 struct rtable __rcu **pprev;
755                 struct rtable *list;
756
757                 if (process_context && need_resched())
758                         cond_resched();
759                 rth = rcu_dereference_raw(rt_hash_table[i].chain);
760                 if (!rth)
761                         continue;
762
763                 spin_lock_bh(rt_hash_lock_addr(i));
764
765                 list = NULL;
766                 pprev = &rt_hash_table[i].chain;
767                 rth = rcu_dereference_protected(*pprev,
768                         lockdep_is_held(rt_hash_lock_addr(i)));
769
770                 while (rth) {
771                         next = rcu_dereference_protected(rth->dst.rt_next,
772                                 lockdep_is_held(rt_hash_lock_addr(i)));
773
774                         if (!net ||
775                             net_eq(dev_net(rth->dst.dev), net)) {
776                                 rcu_assign_pointer(*pprev, next);
777                                 rcu_assign_pointer(rth->dst.rt_next, list);
778                                 list = rth;
779                         } else {
780                                 pprev = &rth->dst.rt_next;
781                         }
782                         rth = next;
783                 }
784
785                 spin_unlock_bh(rt_hash_lock_addr(i));
786
787                 for (; list; list = next) {
788                         next = rcu_dereference_protected(list->dst.rt_next, 1);
789                         rt_free(list);
790                 }
791         }
792 }
793
794 /*
795  * While freeing expired entries, we compute average chain length
796  * and standard deviation, using fixed-point arithmetic.
797  * This gives an estimation of rt_chain_length_max:
798  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
799  * We use 3 bits for the fractional part, and 29 (or 61) bits for the magnitude.
800  */
801
802 #define FRACT_BITS 3
803 #define ONE (1UL << FRACT_BITS)
804
805 /*
806  * Given a hash chain and an item in this hash chain,
807  * find if a previous entry has the same hash_inputs
808  * (but differs on tos, mark or oif)
809  * Returns 0 if an alias is found.
810  * Returns ONE if rth has no alias before itself.
811  */
812 static int has_noalias(const struct rtable *head, const struct rtable *rth)
813 {
814         const struct rtable *aux = head;
815
816         while (aux != rth) {
817                 if (compare_hash_inputs(aux, rth))
818                         return 0;
819                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
820         }
821         return ONE;
822 }
823
824 /*
825  * Perturbation of rt_genid by a small quantity [1..256]
826  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
827  * many times (2^24) before a recent rt_genid value is reused.
828  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
829  */
830 static void rt_cache_invalidate(struct net *net)
831 {
832         unsigned char shuffle;
833
834         get_random_bytes(&shuffle, sizeof(shuffle));
835         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
836 }
837
838 /*
839  * delay < 0  : invalidate cache (fast : entries will be deleted later)
840  * delay >= 0 : invalidate & flush cache (can be long)
841  */
842 void rt_cache_flush(struct net *net, int delay)
843 {
844         rt_cache_invalidate(net);
845         if (delay >= 0)
846                 rt_do_flush(net, !in_softirq());
847 }
848
849 /* Flush previously invalidated entries from the cache */
850 void rt_cache_flush_batch(struct net *net)
851 {
852         rt_do_flush(net, !in_softirq());
853 }
854
855 static void rt_emergency_hash_rebuild(struct net *net)
856 {
857         if (net_ratelimit())
858                 printk(KERN_WARNING "Route hash chain too long!\n");
859         rt_cache_invalidate(net);
860 }
861
862 /*
863    Short description of GC goals.
864
865    We want to build an algorithm which keeps the routing cache
866    at some equilibrium point, where the number of aged-out entries
867    stays approximately equal to the number of newly generated ones.
868
869    The current expiration strength is the variable "expire".
870    We try to adjust it dynamically, so that when networking
871    is idle, expire is large enough to keep plenty of warm entries,
872    and when load increases it shrinks to limit the cache size.
873  */
874
875 static int rt_garbage_collect(struct dst_ops *ops)
876 {
877         static unsigned long expire = RT_GC_TIMEOUT;
878         static unsigned long last_gc;
879         static int rover;
880         static int equilibrium;
881         struct rtable *rth;
882         struct rtable __rcu **rthp;
883         unsigned long now = jiffies;
884         int goal;
885         int entries = dst_entries_get_fast(&ipv4_dst_ops);
886
887         /*
888          * Garbage collection is pretty expensive,
889          * so do not run it too frequently.
890          */
891
892         RT_CACHE_STAT_INC(gc_total);
893
894         if (now - last_gc < ip_rt_gc_min_interval &&
895             entries < ip_rt_max_size) {
896                 RT_CACHE_STAT_INC(gc_ignored);
897                 goto out;
898         }
899
900         entries = dst_entries_get_slow(&ipv4_dst_ops);
901         /* Calculate the number of entries we want to expire now. */
902         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
903         if (goal <= 0) {
904                 if (equilibrium < ipv4_dst_ops.gc_thresh)
905                         equilibrium = ipv4_dst_ops.gc_thresh;
906                 goal = entries - equilibrium;
907                 if (goal > 0) {
908                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
909                         goal = entries - equilibrium;
910                 }
911         } else {
912                 /* We are in a dangerous area. Try to shrink the cache really
913                  * aggressively.
914                  */
915                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
916                 equilibrium = entries - goal;
917         }
918
919         if (now - last_gc >= ip_rt_gc_min_interval)
920                 last_gc = now;
921
922         if (goal <= 0) {
923                 equilibrium += goal;
924                 goto work_done;
925         }
926
927         do {
928                 int i, k;
929
930                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
931                         unsigned long tmo = expire;
932
933                         k = (k + 1) & rt_hash_mask;
934                         rthp = &rt_hash_table[k].chain;
935                         spin_lock_bh(rt_hash_lock_addr(k));
936                         while ((rth = rcu_dereference_protected(*rthp,
937                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
938                                 if (!rt_is_expired(rth) &&
939                                         !rt_may_expire(rth, tmo, expire)) {
940                                         tmo >>= 1;
941                                         rthp = &rth->dst.rt_next;
942                                         continue;
943                                 }
944                                 *rthp = rth->dst.rt_next;
945                                 rt_free(rth);
946                                 goal--;
947                         }
948                         spin_unlock_bh(rt_hash_lock_addr(k));
949                         if (goal <= 0)
950                                 break;
951                 }
952                 rover = k;
953
954                 if (goal <= 0)
955                         goto work_done;
956
957                 /* Goal is not achieved. We stop the process if:
958
959                    - expire was reduced to zero; otherwise, expire is halved.
960                    - the table is not full.
961                    - we are called from interrupt context.
962                    - the jiffies check is just a fallback/debug loop breaker;
963                      we will not spin here for a long time in any case.
964                  */
965
966                 RT_CACHE_STAT_INC(gc_goal_miss);
967
968                 if (expire == 0)
969                         break;
970
971                 expire >>= 1;
972
973                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
974                         goto out;
975         } while (!in_softirq() && time_before_eq(jiffies, now));
976
977         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
978                 goto out;
979         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
980                 goto out;
981         if (net_ratelimit())
982                 printk(KERN_WARNING "dst cache overflow\n");
983         RT_CACHE_STAT_INC(gc_dst_overflow);
984         return 1;
985
986 work_done:
987         expire += ip_rt_gc_min_interval;
988         if (expire > ip_rt_gc_timeout ||
989             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
990             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
991                 expire = ip_rt_gc_timeout;
992 out:    return 0;
993 }
994
995 /*
996  * Returns the number of entries in a hash chain that have distinct hash_inputs
997  */
998 static int slow_chain_length(const struct rtable *head)
999 {
1000         int length = 0;
1001         const struct rtable *rth = head;
1002
1003         while (rth) {
1004                 length += has_noalias(head, rth);
1005                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1006         }
1007         return length >> FRACT_BITS;
1008 }
1009
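/*
 * Insert a freshly built route into the hash table, or return an existing
 * equivalent entry.  When the chain grows beyond ip_rt_gc_elasticity, the
 * lowest-scoring unreferenced entry in the chain is evicted to make room.
 */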
1010 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1011                                      struct sk_buff *skb, int ifindex)
1012 {
1013         struct rtable   *rth, *cand;
1014         struct rtable __rcu **rthp, **candp;
1015         unsigned long   now;
1016         u32             min_score;
1017         int             chain_length;
1018         int attempts = !in_softirq();
1019
1020 restart:
1021         chain_length = 0;
1022         min_score = ~(u32)0;
1023         cand = NULL;
1024         candp = NULL;
1025         now = jiffies;
1026
1027         if (!rt_caching(dev_net(rt->dst.dev))) {
1028                 /*
1029                  * If we're not caching, just tell the caller we
1030                  * were successful and don't touch the route.  The
1031                  * caller holds the sole reference to the cache entry, and
1032                  * it will be released when the caller is done with it.
1033                  * If we drop it here, the callers have no way to resolve routes
1034                  * when we're not caching.  Instead, just point *rp at rt, so
1035                  * the caller gets a single use out of the route.
1036                  * Note that we do rt_free on this new route entry, so that
1037                  * once its refcount hits zero, we are still able to reap it
1038                  * (Thanks Alexey).
1039                  * Note: To avoid expensive rcu stuff for this uncached dst,
1040                  * we set DST_NOCACHE so that dst_release() can free dst without
1041                  * waiting for a grace period.
1042                  */
1043
1044                 rt->dst.flags |= DST_NOCACHE;
1045                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1046                         int err = arp_bind_neighbour(&rt->dst);
1047                         if (err) {
1048                                 if (net_ratelimit())
1049                                         printk(KERN_WARNING
1050                                             "Neighbour table failure & not caching routes.\n");
1051                                 ip_rt_put(rt);
1052                                 return ERR_PTR(err);
1053                         }
1054                 }
1055
1056                 goto skip_hashing;
1057         }
1058
1059         rthp = &rt_hash_table[hash].chain;
1060
1061         spin_lock_bh(rt_hash_lock_addr(hash));
1062         while ((rth = rcu_dereference_protected(*rthp,
1063                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1064                 if (rt_is_expired(rth)) {
1065                         *rthp = rth->dst.rt_next;
1066                         rt_free(rth);
1067                         continue;
1068                 }
1069                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1070                         /* Put it first */
1071                         *rthp = rth->dst.rt_next;
1072                         /*
1073                          * Since lookup is lockfree, the deletion
1074                          * must be visible to another weakly ordered CPU before
1075                          * the insertion at the start of the hash chain.
1076                          */
1077                         rcu_assign_pointer(rth->dst.rt_next,
1078                                            rt_hash_table[hash].chain);
1079                         /*
1080                          * Since lookup is lockfree, the update writes
1081                          * must be ordered for consistency on SMP.
1082                          */
1083                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1084
1085                         dst_use(&rth->dst, now);
1086                         spin_unlock_bh(rt_hash_lock_addr(hash));
1087
1088                         rt_drop(rt);
1089                         if (skb)
1090                                 skb_dst_set(skb, &rth->dst);
1091                         return rth;
1092                 }
1093
1094                 if (!atomic_read(&rth->dst.__refcnt)) {
1095                         u32 score = rt_score(rth);
1096
1097                         if (score <= min_score) {
1098                                 cand = rth;
1099                                 candp = rthp;
1100                                 min_score = score;
1101                         }
1102                 }
1103
1104                 chain_length++;
1105
1106                 rthp = &rth->dst.rt_next;
1107         }
1108
1109         if (cand) {
1110                 /* ip_rt_gc_elasticity used to be the average chain
1111                  * length; when it is exceeded, gc becomes really aggressive.
1112                  *
1113                  * The second limit is less certain. At the moment it allows
1114                  * only 2 entries per bucket. We will see.
1115                  */
1116                 if (chain_length > ip_rt_gc_elasticity) {
1117                         *candp = cand->dst.rt_next;
1118                         rt_free(cand);
1119                 }
1120         } else {
1121                 if (chain_length > rt_chain_length_max &&
1122                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1123                         struct net *net = dev_net(rt->dst.dev);
1124                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1125                         if (!rt_caching(net)) {
1126                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1127                                         rt->dst.dev->name, num);
1128                         }
1129                         rt_emergency_hash_rebuild(net);
1130                         spin_unlock_bh(rt_hash_lock_addr(hash));
1131
1132                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1133                                         ifindex, rt_genid(net));
1134                         goto restart;
1135                 }
1136         }
1137
1138         /* Try to bind the route to an ARP neighbour only if it is an output
1139            route or on the unicast forwarding path.
1140          */
1141         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1142                 int err = arp_bind_neighbour(&rt->dst);
1143                 if (err) {
1144                         spin_unlock_bh(rt_hash_lock_addr(hash));
1145
1146                         if (err != -ENOBUFS) {
1147                                 rt_drop(rt);
1148                                 return ERR_PTR(err);
1149                         }
1150
1151                         /* Neighbour tables are full and nothing
1152                            can be released. Try to shrink the route cache,
1153                            as it most likely holds some neighbour records.
1154                          */
1155                         if (attempts-- > 0) {
1156                                 int saved_elasticity = ip_rt_gc_elasticity;
1157                                 int saved_int = ip_rt_gc_min_interval;
1158                                 ip_rt_gc_elasticity     = 1;
1159                                 ip_rt_gc_min_interval   = 0;
1160                                 rt_garbage_collect(&ipv4_dst_ops);
1161                                 ip_rt_gc_min_interval   = saved_int;
1162                                 ip_rt_gc_elasticity     = saved_elasticity;
1163                                 goto restart;
1164                         }
1165
1166                         if (net_ratelimit())
1167                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1168                         rt_drop(rt);
1169                         return ERR_PTR(-ENOBUFS);
1170                 }
1171         }
1172
1173         rt->dst.rt_next = rt_hash_table[hash].chain;
1174
1175         /*
1176          * Since lookup is lockfree, we must make sure
1177          * previous writes to rt are committed to memory
1178          * before making rt visible to other CPUS.
1179          */
1180         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1181
1182         spin_unlock_bh(rt_hash_lock_addr(hash));
1183
1184 skip_hashing:
1185         if (skb)
1186                 skb_dst_set(skb, &rt->dst);
1187         return rt;
1188 }
1189
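/*
 * Peer generation counter: bumped whenever peer-held data changes (see
 * ip_rt_redirect() and ip_rt_frag_needed() below), so routes that cached
 * a peer can notice the updated redirect/PMTU information.
 */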
1190 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1191
1192 static u32 rt_peer_genid(void)
1193 {
1194         return atomic_read(&__rt_peer_genid);
1195 }
1196
1197 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1198 {
1199         struct inet_peer *peer;
1200
1201         peer = inet_getpeer_v4(daddr, create);
1202
1203         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1204                 inet_putpeer(peer);
1205         else
1206                 rt->rt_peer_genid = rt_peer_genid();
1207 }
1208
1209 /*
1210  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1211  * we can still generate some output.
1212  * Random ID selection looks a bit dangerous because we have no chance of
1213  * selecting an ID that is unique within a reasonable period of time.
1214  * But a broken packet identifier may be better than no packet at all.
1215  */
1216 static void ip_select_fb_ident(struct iphdr *iph)
1217 {
1218         static DEFINE_SPINLOCK(ip_fb_id_lock);
1219         static u32 ip_fallback_id;
1220         u32 salt;
1221
1222         spin_lock_bh(&ip_fb_id_lock);
1223         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1224         iph->id = htons(salt & 0xFFFF);
1225         ip_fallback_id = salt;
1226         spin_unlock_bh(&ip_fb_id_lock);
1227 }
1228
1229 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1230 {
1231         struct rtable *rt = (struct rtable *) dst;
1232
1233         if (rt) {
1234                 if (rt->peer == NULL)
1235                         rt_bind_peer(rt, rt->rt_dst, 1);
1236
1237                 /* If a peer is attached to the destination, it is never detached,
1238                    so we need not grab a lock to dereference it.
1239                  */
1240                 if (rt->peer) {
1241                         iph->id = htons(inet_getid(rt->peer, more));
1242                         return;
1243                 }
1244         } else
1245                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1246                        __builtin_return_address(0));
1247
1248         ip_select_fb_ident(iph);
1249 }
1250 EXPORT_SYMBOL(__ip_select_ident);
1251
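/* Drop rt's reference and unlink it (plus any expired entries found on the way) from its hash chain. */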
1252 static void rt_del(unsigned hash, struct rtable *rt)
1253 {
1254         struct rtable __rcu **rthp;
1255         struct rtable *aux;
1256
1257         rthp = &rt_hash_table[hash].chain;
1258         spin_lock_bh(rt_hash_lock_addr(hash));
1259         ip_rt_put(rt);
1260         while ((aux = rcu_dereference_protected(*rthp,
1261                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1262                 if (aux == rt || rt_is_expired(aux)) {
1263                         *rthp = aux->dst.rt_next;
1264                         rt_free(aux);
1265                         continue;
1266                 }
1267                 rthp = &aux->dst.rt_next;
1268         }
1269         spin_unlock_bh(rt_hash_lock_addr(hash));
1270 }
1271
1272 /* called in rcu_read_lock() section */
1273 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1274                     __be32 saddr, struct net_device *dev)
1275 {
1276         struct in_device *in_dev = __in_dev_get_rcu(dev);
1277         struct inet_peer *peer;
1278         struct net *net;
1279
1280         if (!in_dev)
1281                 return;
1282
1283         net = dev_net(dev);
1284         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1285             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1286             ipv4_is_zeronet(new_gw))
1287                 goto reject_redirect;
1288
1289         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1290                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1291                         goto reject_redirect;
1292                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1293                         goto reject_redirect;
1294         } else {
1295                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1296                         goto reject_redirect;
1297         }
1298
1299         peer = inet_getpeer_v4(daddr, 1);
1300         if (peer) {
1301                 peer->redirect_learned.a4 = new_gw;
1302
1303                 inet_putpeer(peer);
1304
1305                 atomic_inc(&__rt_peer_genid);
1306         }
1307         return;
1308
1309 reject_redirect:
1310 #ifdef CONFIG_IP_ROUTE_VERBOSE
1311         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1312                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1313                         "  Advised path = %pI4 -> %pI4\n",
1314                        &old_gw, dev->name, &new_gw,
1315                        &saddr, &daddr);
1316 #endif
1317         ;
1318 }
1319
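/*
 * peer_pmtu_expired() reports, exactly once, that a learned PMTU has timed
 * out (it atomically clears pmtu_expires); peer_pmtu_cleaned() clears it
 * unconditionally and reports whether anything was pending.
 */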
1320 static bool peer_pmtu_expired(struct inet_peer *peer)
1321 {
1322         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1323
1324         return orig &&
1325                time_after_eq(jiffies, orig) &&
1326                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1327 }
1328
1329 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1330 {
1331         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1332
1333         return orig &&
1334                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1335 }
1336
1337 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1338 {
1339         struct rtable *rt = (struct rtable *)dst;
1340         struct dst_entry *ret = dst;
1341
1342         if (rt) {
1343                 if (dst->obsolete > 0) {
1344                         ip_rt_put(rt);
1345                         ret = NULL;
1346                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1347                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1348                                                 rt->rt_oif,
1349                                                 rt_genid(dev_net(dst->dev)));
1350                         rt_del(hash, rt);
1351                         ret = NULL;
1352                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1353                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1354                 }
1355         }
1356         return ret;
1357 }
1358
1359 /*
1360  * Algorithm:
1361  *      1. The first ip_rt_redirect_number redirects are sent
1362  *         with exponential backoff, then we stop sending them at all,
1363  *         assuming that the host ignores our redirects.
1364  *      2. If we did not see packets requiring redirects
1365  *         during ip_rt_redirect_silence, we assume that the host
1366  *         forgot the redirected route and start sending redirects again.
1367  *
1368  * This algorithm is much cheaper and more intelligent than dumb load limiting
1369  * in icmp.c.
1370  *
1371  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1372  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1373  */
1374
1375 void ip_rt_send_redirect(struct sk_buff *skb)
1376 {
1377         struct rtable *rt = skb_rtable(skb);
1378         struct in_device *in_dev;
1379         struct inet_peer *peer;
1380         int log_martians;
1381
1382         rcu_read_lock();
1383         in_dev = __in_dev_get_rcu(rt->dst.dev);
1384         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1385                 rcu_read_unlock();
1386                 return;
1387         }
1388         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1389         rcu_read_unlock();
1390
1391         if (!rt->peer)
1392                 rt_bind_peer(rt, rt->rt_dst, 1);
1393         peer = rt->peer;
1394         if (!peer) {
1395                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1396                 return;
1397         }
1398
1399         /* No redirected packets during ip_rt_redirect_silence;
1400          * reset the algorithm.
1401          */
1402         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1403                 peer->rate_tokens = 0;
1404
1405         /* Too many ignored redirects; do not send anything,
1406          * just set peer->rate_last to the time of the last seen redirected packet.
1407          */
1408         if (peer->rate_tokens >= ip_rt_redirect_number) {
1409                 peer->rate_last = jiffies;
1410                 return;
1411         }
1412
1413         /* Check for load limit; set rate_last to the latest sent
1414          * redirect.
1415          */
1416         if (peer->rate_tokens == 0 ||
1417             time_after(jiffies,
1418                        (peer->rate_last +
1419                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1420                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1421                 peer->rate_last = jiffies;
1422                 ++peer->rate_tokens;
1423 #ifdef CONFIG_IP_ROUTE_VERBOSE
1424                 if (log_martians &&
1425                     peer->rate_tokens == ip_rt_redirect_number &&
1426                     net_ratelimit())
1427                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1428                                &ip_hdr(skb)->saddr, rt->rt_iif,
1429                                 &rt->rt_dst, &rt->rt_gateway);
1430 #endif
1431         }
1432 }
1433
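/*
 * Input error handler: translate dst.error into an ICMP destination
 * unreachable message, rate-limited through the inet_peer token bucket.
 */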
1434 static int ip_error(struct sk_buff *skb)
1435 {
1436         struct rtable *rt = skb_rtable(skb);
1437         struct inet_peer *peer;
1438         unsigned long now;
1439         bool send;
1440         int code;
1441
1442         switch (rt->dst.error) {
1443                 case EINVAL:
1444                 default:
1445                         goto out;
1446                 case EHOSTUNREACH:
1447                         code = ICMP_HOST_UNREACH;
1448                         break;
1449                 case ENETUNREACH:
1450                         code = ICMP_NET_UNREACH;
1451                         IP_INC_STATS_BH(dev_net(rt->dst.dev),
1452                                         IPSTATS_MIB_INNOROUTES);
1453                         break;
1454                 case EACCES:
1455                         code = ICMP_PKT_FILTERED;
1456                         break;
1457         }
1458
1459         if (!rt->peer)
1460                 rt_bind_peer(rt, rt->rt_dst, 1);
1461         peer = rt->peer;
1462
1463         send = true;
1464         if (peer) {
1465                 now = jiffies;
1466                 peer->rate_tokens += now - peer->rate_last;
1467                 if (peer->rate_tokens > ip_rt_error_burst)
1468                         peer->rate_tokens = ip_rt_error_burst;
1469                 peer->rate_last = now;
1470                 if (peer->rate_tokens >= ip_rt_error_cost)
1471                         peer->rate_tokens -= ip_rt_error_cost;
1472                 else
1473                         send = false;
1474         }
1475         if (send)
1476                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1477
1478 out:    kfree_skb(skb);
1479         return 0;
1480 }
1481
1482 /*
1483  *      The last two values are not from the RFC but
1484  *      are needed for AMPRnet AX.25 paths.
1485  */
1486
1487 static const unsigned short mtu_plateau[] =
1488 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1489
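/* Used when an ICMP Fragmentation Needed message carries no next-hop MTU
 * (ancient routers): fall back to the largest plateau value strictly below
 * the size of the offending packet, in the spirit of RFC 1191.
 */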
1490 static inline unsigned short guess_mtu(unsigned short old_mtu)
1491 {
1492         int i;
1493
1494         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1495                 if (old_mtu > mtu_plateau[i])
1496                         return mtu_plateau[i];
1497         return 68;
1498 }
1499
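/* Handle an incoming ICMP Fragmentation Needed report: record the learned
 * path MTU (clamped to ip_rt_min_pmtu) in the destination's inet_peer with
 * an expiry of ip_rt_mtu_expires, and bump the peer generation id so cached
 * routes pick up the new value on their next validation.
 */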
1500 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1501                                  unsigned short new_mtu,
1502                                  struct net_device *dev)
1503 {
1504         unsigned short old_mtu = ntohs(iph->tot_len);
1505         unsigned short est_mtu = 0;
1506         struct inet_peer *peer;
1507
1508         peer = inet_getpeer_v4(iph->daddr, 1);
1509         if (peer) {
1510                 unsigned short mtu = new_mtu;
1511
1512                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1513                         /* BSD 4.2 derived systems incorrectly adjust
1514                          * tot_len by the IP header length, and report
1515                          * a zero MTU in the ICMP message.
1516                          */
1517                         if (mtu == 0 &&
1518                             old_mtu >= 68 + (iph->ihl << 2))
1519                                 old_mtu -= iph->ihl << 2;
1520                         mtu = guess_mtu(old_mtu);
1521                 }
1522
1523                 if (mtu < ip_rt_min_pmtu)
1524                         mtu = ip_rt_min_pmtu;
1525                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1526                         unsigned long pmtu_expires;
1527
1528                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1529                         if (!pmtu_expires)
1530                                 pmtu_expires = 1UL;
1531
1532                         est_mtu = mtu;
1533                         peer->pmtu_learned = mtu;
1534                         peer->pmtu_expires = pmtu_expires;
1535                 }
1536
1537                 inet_putpeer(peer);
1538
1539                 atomic_inc(&__rt_peer_genid);
1540         }
1541         return est_mtu ? : new_mtu;
1542 }
1543
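/* If the peer holds a learned PMTU that has not expired yet, install it as
 * the route's MTU metric, remembering the original metric so it can be
 * restored once the learned PMTU expires.
 */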
1544 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1545 {
1546         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1547
1548         if (!expires)
1549                 return;
1550         if (time_before(jiffies, expires)) {
1551                 u32 orig_dst_mtu = dst_mtu(dst);
1552                 if (peer->pmtu_learned < orig_dst_mtu) {
1553                         if (!peer->pmtu_orig)
1554                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1555                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1556                 }
1557         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1558                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1559 }
1560
1561 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1562 {
1563         struct rtable *rt = (struct rtable *) dst;
1564         struct inet_peer *peer;
1565
1566         dst_confirm(dst);
1567
1568         if (!rt->peer)
1569                 rt_bind_peer(rt, rt->rt_dst, 1);
1570         peer = rt->peer;
1571         if (peer) {
1572                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1573
1574                 if (mtu < ip_rt_min_pmtu)
1575                         mtu = ip_rt_min_pmtu;
1576                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1577
1578                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1579                         if (!pmtu_expires)
1580                                 pmtu_expires = 1UL;
1581
1582                         peer->pmtu_learned = mtu;
1583                         peer->pmtu_expires = pmtu_expires;
1584
1585                         atomic_inc(&__rt_peer_genid);
1586                         rt->rt_peer_genid = rt_peer_genid();
1587                 }
1588                 check_peer_pmtu(dst, peer);
1589         }
1590 }
1591
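/* Switch the route to a gateway learned from an ICMP redirect: drop the old
 * neighbour binding, bind a neighbour for the new gateway, and fall back to
 * the original gateway if the new neighbour is not (yet) valid.
 */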
1592 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1593 {
1594         struct rtable *rt = (struct rtable *) dst;
1595         __be32 orig_gw = rt->rt_gateway;
1596
1597         dst_confirm(&rt->dst);
1598
1599         neigh_release(rt->dst.neighbour);
1600         rt->dst.neighbour = NULL;
1601
1602         rt->rt_gateway = peer->redirect_learned.a4;
1603         if (arp_bind_neighbour(&rt->dst) ||
1604             !(rt->dst.neighbour->nud_state & NUD_VALID)) {
1605                 if (rt->dst.neighbour)
1606                         neigh_event_send(rt->dst.neighbour, NULL);
1607                 rt->rt_gateway = orig_gw;
1608                 return -EAGAIN;
1609         } else {
1610                 rt->rt_flags |= RTCF_REDIRECTED;
1611                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
1612                                         rt->dst.neighbour);
1613         }
1614         return 0;
1615 }
1616
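/* Validate a cached route. Besides the usual generation check, compare the
 * per-peer generation id so that PMTU and redirect information learned since
 * the route was cached is applied lazily on the next lookup.
 */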
1617 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1618 {
1619         struct rtable *rt = (struct rtable *) dst;
1620
1621         if (rt_is_expired(rt))
1622                 return NULL;
1623         if (rt->rt_peer_genid != rt_peer_genid()) {
1624                 struct inet_peer *peer;
1625
1626                 if (!rt->peer)
1627                         rt_bind_peer(rt, rt->rt_dst, 0);
1628
1629                 peer = rt->peer;
1630                 if (peer) {
1631                         check_peer_pmtu(dst, peer);
1632
1633                         if (peer->redirect_learned.a4 &&
1634                             peer->redirect_learned.a4 != rt->rt_gateway) {
1635                                 if (check_peer_redir(dst, peer))
1636                                         return NULL;
1637                         }
1638                 }
1639
1640                 rt->rt_peer_genid = rt_peer_genid();
1641         }
1642         return dst;
1643 }
1644
1645 static void ipv4_dst_destroy(struct dst_entry *dst)
1646 {
1647         struct rtable *rt = (struct rtable *) dst;
1648         struct inet_peer *peer = rt->peer;
1649
1650         if (rt->fi) {
1651                 fib_info_put(rt->fi);
1652                 rt->fi = NULL;
1653         }
1654         if (peer) {
1655                 rt->peer = NULL;
1656                 inet_putpeer(peer);
1657         }
1658 }
1659
1660
1661 static void ipv4_link_failure(struct sk_buff *skb)
1662 {
1663         struct rtable *rt;
1664
1665         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1666
1667         rt = skb_rtable(skb);
1668         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1669                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1670 }
1671
1672 static int ip_rt_bug(struct sk_buff *skb)
1673 {
1674         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1675                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1676                 skb->dev ? skb->dev->name : "?");
1677         kfree_skb(skb);
1678         WARN_ON(1);
1679         return 0;
1680 }
1681
1682 /*
1683    We do not cache the source address of the outgoing interface,
1684    because it is used only by the IP RR, TS and SRR options,
1685    so it is out of the fast path.
1686
1687    BTW remember: "addr" is allowed to be unaligned
1688    in IP options!
1689  */
1690
1691 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1692 {
1693         __be32 src;
1694
1695         if (rt_is_output_route(rt))
1696                 src = ip_hdr(skb)->saddr;
1697         else {
1698                 struct fib_result res;
1699                 struct flowi4 fl4;
1700                 struct iphdr *iph;
1701
1702                 iph = ip_hdr(skb);
1703
1704                 memset(&fl4, 0, sizeof(fl4));
1705                 fl4.daddr = iph->daddr;
1706                 fl4.saddr = iph->saddr;
1707                 fl4.flowi4_tos = RT_TOS(iph->tos);
1708                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1709                 fl4.flowi4_iif = skb->dev->ifindex;
1710                 fl4.flowi4_mark = skb->mark;
1711
1712                 rcu_read_lock();
1713                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1714                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1715                 else
1716                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1717                                         RT_SCOPE_UNIVERSE);
1718                 rcu_read_unlock();
1719         }
1720         memcpy(addr, &src, 4);
1721 }
1722
1723 #ifdef CONFIG_IP_ROUTE_CLASSID
1724 static void set_class_tag(struct rtable *rt, u32 tag)
1725 {
1726         if (!(rt->dst.tclassid & 0xFFFF))
1727                 rt->dst.tclassid |= tag & 0xFFFF;
1728         if (!(rt->dst.tclassid & 0xFFFF0000))
1729                 rt->dst.tclassid |= tag & 0xFFFF0000;
1730 }
1731 #endif
1732
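/* Default advertised MSS: the device MTU minus 40 bytes of IPv4 and TCP
 * headers, clamped between ip_rt_min_advmss and 65535 - 40.
 */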
1733 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1734 {
1735         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1736
1737         if (advmss == 0) {
1738                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1739                                ip_rt_min_advmss);
1740                 if (advmss > 65535 - 40)
1741                         advmss = 65535 - 40;
1742         }
1743         return advmss;
1744 }
1745
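/* Default path MTU: the device MTU, except that a locked MTU metric on a
 * gatewayed route is conservatively capped at the classic 576 bytes, and
 * everything is bounded by IP_MAX_MTU.
 */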
1746 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1747 {
1748         unsigned int mtu = dst->dev->mtu;
1749
1750         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1751                 const struct rtable *rt = (const struct rtable *) dst;
1752
1753                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1754                         mtu = 576;
1755         }
1756
1757         if (mtu > IP_MAX_MTU)
1758                 mtu = IP_MAX_MTU;
1759
1760         return mtu;
1761 }
1762
1763 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1764                             struct fib_info *fi)
1765 {
1766         struct inet_peer *peer;
1767         int create = 0;
1768
1769         /* If a peer entry exists for this destination, we must hook
1770          * it up in order to get at cached metrics.
1771          */
1772         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1773                 create = 1;
1774
1775         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1776         if (peer) {
1777                 rt->rt_peer_genid = rt_peer_genid();
1778                 if (inet_metrics_new(peer))
1779                         memcpy(peer->metrics, fi->fib_metrics,
1780                                sizeof(u32) * RTAX_MAX);
1781                 dst_init_metrics(&rt->dst, peer->metrics, false);
1782
1783                 check_peer_pmtu(&rt->dst, peer);
1784                 if (peer->redirect_learned.a4 &&
1785                     peer->redirect_learned.a4 != rt->rt_gateway) {
1786                         rt->rt_gateway = peer->redirect_learned.a4;
1787                         rt->rt_flags |= RTCF_REDIRECTED;
1788                 }
1789         } else {
1790                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1791                         rt->fi = fi;
1792                         atomic_inc(&fi->fib_clntref);
1793                 }
1794                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1795         }
1796 }
1797
1798 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1799                            const struct fib_result *res,
1800                            struct fib_info *fi, u16 type, u32 itag)
1801 {
1802         struct dst_entry *dst = &rt->dst;
1803
1804         if (fi) {
1805                 if (FIB_RES_GW(*res) &&
1806                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1807                         rt->rt_gateway = FIB_RES_GW(*res);
1808                 rt_init_metrics(rt, fl4, fi);
1809 #ifdef CONFIG_IP_ROUTE_CLASSID
1810                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1811 #endif
1812         }
1813
1814         if (dst_mtu(dst) > IP_MAX_MTU)
1815                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1816         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1817                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1818
1819 #ifdef CONFIG_IP_ROUTE_CLASSID
1820 #ifdef CONFIG_IP_MULTIPLE_TABLES
1821         set_class_tag(rt, fib_rules_tclass(res));
1822 #endif
1823         set_class_tag(rt, itag);
1824 #endif
1825 }
1826
1827 static struct rtable *rt_dst_alloc(struct net_device *dev,
1828                                    bool nopolicy, bool noxfrm)
1829 {
1830         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1831                          DST_HOST |
1832                          (nopolicy ? DST_NOPOLICY : 0) |
1833                          (noxfrm ? DST_NOXFRM : 0));
1834 }
1835
1836 /* called in rcu_read_lock() section */
1837 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1838                                 u8 tos, struct net_device *dev, int our)
1839 {
1840         unsigned int hash;
1841         struct rtable *rth;
1842         __be32 spec_dst;
1843         struct in_device *in_dev = __in_dev_get_rcu(dev);
1844         u32 itag = 0;
1845         int err;
1846
1847         /* Primary sanity checks. */
1848
1849         if (in_dev == NULL)
1850                 return -EINVAL;
1851
1852         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1853             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1854                 goto e_inval;
1855
1856         if (ipv4_is_zeronet(saddr)) {
1857                 if (!ipv4_is_local_multicast(daddr))
1858                         goto e_inval;
1859                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1860         } else {
1861                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1862                                           &itag);
1863                 if (err < 0)
1864                         goto e_err;
1865         }
1866         rth = rt_dst_alloc(init_net.loopback_dev,
1867                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1868         if (!rth)
1869                 goto e_nobufs;
1870
1871 #ifdef CONFIG_IP_ROUTE_CLASSID
1872         rth->dst.tclassid = itag;
1873 #endif
1874         rth->dst.output = ip_rt_bug;
1875
1876         rth->rt_key_dst = daddr;
1877         rth->rt_key_src = saddr;
1878         rth->rt_genid   = rt_genid(dev_net(dev));
1879         rth->rt_flags   = RTCF_MULTICAST;
1880         rth->rt_type    = RTN_MULTICAST;
1881         rth->rt_key_tos = tos;
1882         rth->rt_dst     = daddr;
1883         rth->rt_src     = saddr;
1884         rth->rt_route_iif = dev->ifindex;
1885         rth->rt_iif     = dev->ifindex;
1886         rth->rt_oif     = 0;
1887         rth->rt_mark    = skb->mark;
1888         rth->rt_gateway = daddr;
1889         rth->rt_spec_dst= spec_dst;
1890         rth->rt_peer_genid = 0;
1891         rth->peer = NULL;
1892         rth->fi = NULL;
1893         if (our) {
1894                 rth->dst.input= ip_local_deliver;
1895                 rth->rt_flags |= RTCF_LOCAL;
1896         }
1897
1898 #ifdef CONFIG_IP_MROUTE
1899         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1900                 rth->dst.input = ip_mr_input;
1901 #endif
1902         RT_CACHE_STAT_INC(in_slow_mc);
1903
1904         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1905         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1906         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1907
1908 e_nobufs:
1909         return -ENOBUFS;
1910 e_inval:
1911         return -EINVAL;
1912 e_err:
1913         return err;
1914 }
1915
1916
1917 static void ip_handle_martian_source(struct net_device *dev,
1918                                      struct in_device *in_dev,
1919                                      struct sk_buff *skb,
1920                                      __be32 daddr,
1921                                      __be32 saddr)
1922 {
1923         RT_CACHE_STAT_INC(in_martian_src);
1924 #ifdef CONFIG_IP_ROUTE_VERBOSE
1925         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1926                 /*
1927                  *      RFC1812 recommendation: if the source is martian,
1928                  *      the only hint is the MAC header.
1929                  */
1930                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1931                         &daddr, &saddr, dev->name);
1932                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1933                         int i;
1934                         const unsigned char *p = skb_mac_header(skb);
1935                         printk(KERN_WARNING "ll header: ");
1936                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1937                                 printk("%02x", *p);
1938                                 if (i < (dev->hard_header_len - 1))
1939                                         printk(":");
1940                         }
1941                         printk("\n");
1942                 }
1943         }
1944 #endif
1945 }
1946
1947 /* called in rcu_read_lock() section */
1948 static int __mkroute_input(struct sk_buff *skb,
1949                            const struct fib_result *res,
1950                            struct in_device *in_dev,
1951                            __be32 daddr, __be32 saddr, u32 tos,
1952                            struct rtable **result)
1953 {
1954         struct rtable *rth;
1955         int err;
1956         struct in_device *out_dev;
1957         unsigned int flags = 0;
1958         __be32 spec_dst;
1959         u32 itag;
1960
1961         /* get a working reference to the output device */
1962         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1963         if (out_dev == NULL) {
1964                 if (net_ratelimit())
1965                         printk(KERN_CRIT "Bug in ip_route_input" \
1966                                "_slow(). Please, report\n");
1967                 return -EINVAL;
1968         }
1969
1970
1971         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1972                                   in_dev->dev, &spec_dst, &itag);
1973         if (err < 0) {
1974                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1975                                          saddr);
1976
1977                 goto cleanup;
1978         }
1979
1980         if (err)
1981                 flags |= RTCF_DIRECTSRC;
1982
1983         if (out_dev == in_dev && err &&
1984             (IN_DEV_SHARED_MEDIA(out_dev) ||
1985              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1986                 flags |= RTCF_DOREDIRECT;
1987
1988         if (skb->protocol != htons(ETH_P_IP)) {
1989                 /* Not IP (i.e. ARP). Do not create a route if it is
1990                  * invalid for proxy ARP. DNAT routes are always valid.
1991                  *
1992                  * The proxy ARP feature has been extended to allow ARP
1993                  * replies back on the same interface, to support
1994                  * private VLAN switch technologies. See arp.c.
1995                  */
1996                 if (out_dev == in_dev &&
1997                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1998                         err = -EINVAL;
1999                         goto cleanup;
2000                 }
2001         }
2002
2003         rth = rt_dst_alloc(out_dev->dev,
2004                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2005                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2006         if (!rth) {
2007                 err = -ENOBUFS;
2008                 goto cleanup;
2009         }
2010
2011         rth->rt_key_dst = daddr;
2012         rth->rt_key_src = saddr;
2013         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2014         rth->rt_flags = flags;
2015         rth->rt_type = res->type;
2016         rth->rt_key_tos = tos;
2017         rth->rt_dst     = daddr;
2018         rth->rt_src     = saddr;
2019         rth->rt_route_iif = in_dev->dev->ifindex;
2020         rth->rt_iif     = in_dev->dev->ifindex;
2021         rth->rt_oif     = 0;
2022         rth->rt_mark    = skb->mark;
2023         rth->rt_gateway = daddr;
2024         rth->rt_spec_dst= spec_dst;
2025         rth->rt_peer_genid = 0;
2026         rth->peer = NULL;
2027         rth->fi = NULL;
2028
2029         rth->dst.input = ip_forward;
2030         rth->dst.output = ip_output;
2031
2032         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2033
2034         *result = rth;
2035         err = 0;
2036  cleanup:
2037         return err;
2038 }
2039
2040 static int ip_mkroute_input(struct sk_buff *skb,
2041                             struct fib_result *res,
2042                             const struct flowi4 *fl4,
2043                             struct in_device *in_dev,
2044                             __be32 daddr, __be32 saddr, u32 tos)
2045 {
2046         struct rtable* rth = NULL;
2047         int err;
2048         unsigned hash;
2049
2050 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2051         if (res->fi && res->fi->fib_nhs > 1)
2052                 fib_select_multipath(res);
2053 #endif
2054
2055         /* create a routing cache entry */
2056         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2057         if (err)
2058                 return err;
2059
2060         /* put it into the cache */
2061         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2062                        rt_genid(dev_net(rth->dst.dev)));
2063         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2064         if (IS_ERR(rth))
2065                 return PTR_ERR(rth);
2066         return 0;
2067 }
2068
2069 /*
2070  *      NOTE. We drop all packets that have local source
2071  *      addresses, because every properly looped-back packet
2072  *      must already have the correct destination attached by the output routine.
2073  *
2074  *      This approach solves two big problems:
2075  *      1. Non-simplex devices are handled properly.
2076  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2077  *      Called with rcu_read_lock().
2078  */
2079
2080 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2081                                u8 tos, struct net_device *dev)
2082 {
2083         struct fib_result res;
2084         struct in_device *in_dev = __in_dev_get_rcu(dev);
2085         struct flowi4   fl4;
2086         unsigned        flags = 0;
2087         u32             itag = 0;
2088         struct rtable * rth;
2089         unsigned        hash;
2090         __be32          spec_dst;
2091         int             err = -EINVAL;
2092         struct net    * net = dev_net(dev);
2093
2094         /* IP on this device is disabled. */
2095
2096         if (!in_dev)
2097                 goto out;
2098
2099         /* Check for the strangest martians, which cannot be detected
2100            by fib_lookup.
2101          */
2102
2103         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2104             ipv4_is_loopback(saddr))
2105                 goto martian_source;
2106
2107         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2108                 goto brd_input;
2109
2110         /* Accept zero addresses only for limited broadcast;
2111          * I do not even know whether to fix it or not. Waiting for complaints :-)
2112          */
2113         if (ipv4_is_zeronet(saddr))
2114                 goto martian_source;
2115
2116         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2117                 goto martian_destination;
2118
2119         /*
2120          *      Now we are ready to route the packet.
2121          */
2122         fl4.flowi4_oif = 0;
2123         fl4.flowi4_iif = dev->ifindex;
2124         fl4.flowi4_mark = skb->mark;
2125         fl4.flowi4_tos = tos;
2126         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2127         fl4.daddr = daddr;
2128         fl4.saddr = saddr;
2129         err = fib_lookup(net, &fl4, &res);
2130         if (err != 0) {
2131                 if (!IN_DEV_FORWARD(in_dev))
2132                         goto e_hostunreach;
2133                 goto no_route;
2134         }
2135
2136         RT_CACHE_STAT_INC(in_slow_tot);
2137
2138         if (res.type == RTN_BROADCAST)
2139                 goto brd_input;
2140
2141         if (res.type == RTN_LOCAL) {
2142                 err = fib_validate_source(skb, saddr, daddr, tos,
2143                                           net->loopback_dev->ifindex,
2144                                           dev, &spec_dst, &itag);
2145                 if (err < 0)
2146                         goto martian_source_keep_err;
2147                 if (err)
2148                         flags |= RTCF_DIRECTSRC;
2149                 spec_dst = daddr;
2150                 goto local_input;
2151         }
2152
2153         if (!IN_DEV_FORWARD(in_dev))
2154                 goto e_hostunreach;
2155         if (res.type != RTN_UNICAST)
2156                 goto martian_destination;
2157
2158         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2159 out:    return err;
2160
2161 brd_input:
2162         if (skb->protocol != htons(ETH_P_IP))
2163                 goto e_inval;
2164
2165         if (ipv4_is_zeronet(saddr))
2166                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2167         else {
2168                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2169                                           &itag);
2170                 if (err < 0)
2171                         goto martian_source_keep_err;
2172                 if (err)
2173                         flags |= RTCF_DIRECTSRC;
2174         }
2175         flags |= RTCF_BROADCAST;
2176         res.type = RTN_BROADCAST;
2177         RT_CACHE_STAT_INC(in_brd);
2178
2179 local_input:
2180         rth = rt_dst_alloc(net->loopback_dev,
2181                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2182         if (!rth)
2183                 goto e_nobufs;
2184
2185         rth->dst.input= ip_local_deliver;
2186         rth->dst.output= ip_rt_bug;
2187 #ifdef CONFIG_IP_ROUTE_CLASSID
2188         rth->dst.tclassid = itag;
2189 #endif
2190
2191         rth->rt_key_dst = daddr;
2192         rth->rt_key_src = saddr;
2193         rth->rt_genid = rt_genid(net);
2194         rth->rt_flags   = flags|RTCF_LOCAL;
2195         rth->rt_type    = res.type;
2196         rth->rt_key_tos = tos;
2197         rth->rt_dst     = daddr;
2198         rth->rt_src     = saddr;
2199 #ifdef CONFIG_IP_ROUTE_CLASSID
2200         rth->dst.tclassid = itag;
2201 #endif
2202         rth->rt_route_iif = dev->ifindex;
2203         rth->rt_iif     = dev->ifindex;
2204         rth->rt_oif     = 0;
2205         rth->rt_mark    = skb->mark;
2206         rth->rt_gateway = daddr;
2207         rth->rt_spec_dst= spec_dst;
2208         rth->rt_peer_genid = 0;
2209         rth->peer = NULL;
2210         rth->fi = NULL;
2211         if (res.type == RTN_UNREACHABLE) {
2212                 rth->dst.input= ip_error;
2213                 rth->dst.error= -err;
2214                 rth->rt_flags   &= ~RTCF_LOCAL;
2215         }
2216         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2217         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2218         err = 0;
2219         if (IS_ERR(rth))
2220                 err = PTR_ERR(rth);
2221         goto out;
2222
2223 no_route:
2224         RT_CACHE_STAT_INC(in_no_route);
2225         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2226         res.type = RTN_UNREACHABLE;
2227         if (err == -ESRCH)
2228                 err = -ENETUNREACH;
2229         goto local_input;
2230
2231         /*
2232          *      Do not cache martian addresses: they should be logged (RFC1812)
2233          */
2234 martian_destination:
2235         RT_CACHE_STAT_INC(in_martian_dst);
2236 #ifdef CONFIG_IP_ROUTE_VERBOSE
2237         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2238                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2239                         &daddr, &saddr, dev->name);
2240 #endif
2241
2242 e_hostunreach:
2243         err = -EHOSTUNREACH;
2244         goto out;
2245
2246 e_inval:
2247         err = -EINVAL;
2248         goto out;
2249
2250 e_nobufs:
2251         err = -ENOBUFS;
2252         goto out;
2253
2254 martian_source:
2255         err = -EINVAL;
2256 martian_source_keep_err:
2257         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2258         goto out;
2259 }
2260
2261 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2262                            u8 tos, struct net_device *dev, bool noref)
2263 {
2264         struct rtable * rth;
2265         unsigned        hash;
2266         int iif = dev->ifindex;
2267         struct net *net;
2268         int res;
2269
2270         net = dev_net(dev);
2271
2272         rcu_read_lock();
2273
2274         if (!rt_caching(net))
2275                 goto skip_cache;
2276
2277         tos &= IPTOS_RT_MASK;
2278         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2279
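        /* Compare dst, src, iif and tos in one go: OR together the XOR
         * differences so the common mismatch case costs a single branch.
         */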
2280         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2281              rth = rcu_dereference(rth->dst.rt_next)) {
2282                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2283                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2284                      (rth->rt_route_iif ^ iif) |
2285                      (rth->rt_key_tos ^ tos)) == 0 &&
2286                     rth->rt_mark == skb->mark &&
2287                     net_eq(dev_net(rth->dst.dev), net) &&
2288                     !rt_is_expired(rth)) {
2289                         if (noref) {
2290                                 dst_use_noref(&rth->dst, jiffies);
2291                                 skb_dst_set_noref(skb, &rth->dst);
2292                         } else {
2293                                 dst_use(&rth->dst, jiffies);
2294                                 skb_dst_set(skb, &rth->dst);
2295                         }
2296                         RT_CACHE_STAT_INC(in_hit);
2297                         rcu_read_unlock();
2298                         return 0;
2299                 }
2300                 RT_CACHE_STAT_INC(in_hlist_search);
2301         }
2302
2303 skip_cache:
2304         /* Multicast recognition logic is moved from the route cache to here.
2305            The problem was that too many Ethernet cards have broken/missing
2306            hardware multicast filters :-( As a result, a host on a multicast
2307            network acquires a lot of useless route cache entries, e.g. from
2308            SDR messages from all over the world. Now we try to get rid of them.
2309            Really, provided the software IP multicast filter is organized
2310            reasonably (at least, hashed), it does not result in a slowdown
2311            compared with route cache reject entries.
2312            Note that multicast routers are not affected, because a
2313            route cache entry is created eventually.
2314          */
2315         if (ipv4_is_multicast(daddr)) {
2316                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2317
2318                 if (in_dev) {
2319                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2320                                                   ip_hdr(skb)->protocol);
2321                         if (our
2322 #ifdef CONFIG_IP_MROUTE
2323                                 ||
2324                             (!ipv4_is_local_multicast(daddr) &&
2325                              IN_DEV_MFORWARD(in_dev))
2326 #endif
2327                            ) {
2328                                 int res = ip_route_input_mc(skb, daddr, saddr,
2329                                                             tos, dev, our);
2330                                 rcu_read_unlock();
2331                                 return res;
2332                         }
2333                 }
2334                 rcu_read_unlock();
2335                 return -EINVAL;
2336         }
2337         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2338         rcu_read_unlock();
2339         return res;
2340 }
2341 EXPORT_SYMBOL(ip_route_input_common);
2342
2343 /* called with rcu_read_lock() */
2344 static struct rtable *__mkroute_output(const struct fib_result *res,
2345                                        const struct flowi4 *fl4,
2346                                        __be32 orig_daddr, __be32 orig_saddr,
2347                                        int orig_oif, struct net_device *dev_out,
2348                                        unsigned int flags)
2349 {
2350         struct fib_info *fi = res->fi;
2351         u32 tos = RT_FL_TOS(fl4);
2352         struct in_device *in_dev;
2353         u16 type = res->type;
2354         struct rtable *rth;
2355
2356         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2357                 return ERR_PTR(-EINVAL);
2358
2359         if (ipv4_is_lbcast(fl4->daddr))
2360                 type = RTN_BROADCAST;
2361         else if (ipv4_is_multicast(fl4->daddr))
2362                 type = RTN_MULTICAST;
2363         else if (ipv4_is_zeronet(fl4->daddr))
2364                 return ERR_PTR(-EINVAL);
2365
2366         if (dev_out->flags & IFF_LOOPBACK)
2367                 flags |= RTCF_LOCAL;
2368
2369         in_dev = __in_dev_get_rcu(dev_out);
2370         if (!in_dev)
2371                 return ERR_PTR(-EINVAL);
2372
2373         if (type == RTN_BROADCAST) {
2374                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2375                 fi = NULL;
2376         } else if (type == RTN_MULTICAST) {
2377                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2378                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2379                                      fl4->flowi4_proto))
2380                         flags &= ~RTCF_LOCAL;
2381                 /* If a multicast route does not exist, use the
2382                  * default one, but do not gateway in this case.
2383                  * Yes, it is a hack.
2384                  */
2385                 if (fi && res->prefixlen < 4)
2386                         fi = NULL;
2387         }
2388
2389         rth = rt_dst_alloc(dev_out,
2390                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2391                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2392         if (!rth)
2393                 return ERR_PTR(-ENOBUFS);
2394
2395         rth->dst.output = ip_output;
2396
2397         rth->rt_key_dst = orig_daddr;
2398         rth->rt_key_src = orig_saddr;
2399         rth->rt_genid = rt_genid(dev_net(dev_out));
2400         rth->rt_flags   = flags;
2401         rth->rt_type    = type;
2402         rth->rt_key_tos = tos;
2403         rth->rt_dst     = fl4->daddr;
2404         rth->rt_src     = fl4->saddr;
2405         rth->rt_route_iif = 0;
2406         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2407         rth->rt_oif     = orig_oif;
2408         rth->rt_mark    = fl4->flowi4_mark;
2409         rth->rt_gateway = fl4->daddr;
2410         rth->rt_spec_dst= fl4->saddr;
2411         rth->rt_peer_genid = 0;
2412         rth->peer = NULL;
2413         rth->fi = NULL;
2414
2415         RT_CACHE_STAT_INC(out_slow_tot);
2416
2417         if (flags & RTCF_LOCAL) {
2418                 rth->dst.input = ip_local_deliver;
2419                 rth->rt_spec_dst = fl4->daddr;
2420         }
2421         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2422                 rth->rt_spec_dst = fl4->saddr;
2423                 if (flags & RTCF_LOCAL &&
2424                     !(dev_out->flags & IFF_LOOPBACK)) {
2425                         rth->dst.output = ip_mc_output;
2426                         RT_CACHE_STAT_INC(out_slow_mc);
2427                 }
2428 #ifdef CONFIG_IP_MROUTE
2429                 if (type == RTN_MULTICAST) {
2430                         if (IN_DEV_MFORWARD(in_dev) &&
2431                             !ipv4_is_local_multicast(fl4->daddr)) {
2432                                 rth->dst.input = ip_mr_input;
2433                                 rth->dst.output = ip_mc_output;
2434                         }
2435                 }
2436 #endif
2437         }
2438
2439         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2440
2441         return rth;
2442 }
2443
2444 /*
2445  * Major route resolver routine.
2446  * called with rcu_read_lock();
2447  */
2448
2449 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2450 {
2451         struct net_device *dev_out = NULL;
2452         u32 tos = RT_FL_TOS(fl4);
2453         unsigned int flags = 0;
2454         struct fib_result res;
2455         struct rtable *rth;
2456         __be32 orig_daddr;
2457         __be32 orig_saddr;
2458         int orig_oif;
2459
2460         res.fi          = NULL;
2461 #ifdef CONFIG_IP_MULTIPLE_TABLES
2462         res.r           = NULL;
2463 #endif
2464
2465         orig_daddr = fl4->daddr;
2466         orig_saddr = fl4->saddr;
2467         orig_oif = fl4->flowi4_oif;
2468
2469         fl4->flowi4_iif = net->loopback_dev->ifindex;
2470         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2471         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2472                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2473
2474         rcu_read_lock();
2475         if (fl4->saddr) {
2476                 rth = ERR_PTR(-EINVAL);
2477                 if (ipv4_is_multicast(fl4->saddr) ||
2478                     ipv4_is_lbcast(fl4->saddr) ||
2479                     ipv4_is_zeronet(fl4->saddr))
2480                         goto out;
2481
2482                 /* I removed the check for oif == dev_out->oif here.
2483                    It was wrong for two reasons:
2484                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2485                       is assigned to multiple interfaces.
2486                    2. Moreover, we are allowed to send packets with the saddr
2487                       of another iface. --ANK
2488                  */
2489
2490                 if (fl4->flowi4_oif == 0 &&
2491                     (ipv4_is_multicast(fl4->daddr) ||
2492                      ipv4_is_lbcast(fl4->daddr))) {
2493                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2494                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2495                         if (dev_out == NULL)
2496                                 goto out;
2497
2498                         /* Special hack: the user can direct multicasts
2499                            and limited broadcast via the necessary interface
2500                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2501                            This hack is not just for fun, it allows
2502                            vic, vat and friends to work.
2503                            They bind the socket to loopback, set ttl to zero
2504                            and expect that it will work.
2505                            From the viewpoint of the routing cache they are broken,
2506                            because we are not allowed to build a multicast path
2507                            with a loopback source addr (look, the routing cache
2508                            cannot know that ttl is zero, so the packet
2509                            will not leave this host and the route stays valid).
2510                            Luckily, this hack is a good workaround.
2511                          */
2512
2513                         fl4->flowi4_oif = dev_out->ifindex;
2514                         goto make_route;
2515                 }
2516
2517                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2518                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2519                         if (!__ip_dev_find(net, fl4->saddr, false))
2520                                 goto out;
2521                 }
2522         }
2523
2524
2525         if (fl4->flowi4_oif) {
2526                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2527                 rth = ERR_PTR(-ENODEV);
2528                 if (dev_out == NULL)
2529                         goto out;
2530
2531                 /* RACE: Check return value of inet_select_addr instead. */
2532                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2533                         rth = ERR_PTR(-ENETUNREACH);
2534                         goto out;
2535                 }
2536                 if (ipv4_is_local_multicast(fl4->daddr) ||
2537                     ipv4_is_lbcast(fl4->daddr)) {
2538                         if (!fl4->saddr)
2539                                 fl4->saddr = inet_select_addr(dev_out, 0,
2540                                                               RT_SCOPE_LINK);
2541                         goto make_route;
2542                 }
2543                 if (fl4->saddr) {
2544                         if (ipv4_is_multicast(fl4->daddr))
2545                                 fl4->saddr = inet_select_addr(dev_out, 0,
2546                                                               fl4->flowi4_scope);
2547                         else if (!fl4->daddr)
2548                                 fl4->saddr = inet_select_addr(dev_out, 0,
2549                                                               RT_SCOPE_HOST);
2550                 }
2551         }
2552
2553         if (!fl4->daddr) {
2554                 fl4->daddr = fl4->saddr;
2555                 if (!fl4->daddr)
2556                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2557                 dev_out = net->loopback_dev;
2558                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2559                 res.type = RTN_LOCAL;
2560                 flags |= RTCF_LOCAL;
2561                 goto make_route;
2562         }
2563
2564         if (fib_lookup(net, fl4, &res)) {
2565                 res.fi = NULL;
2566                 if (fl4->flowi4_oif) {
2567                         /* Apparently, the routing tables are wrong. Assume
2568                            that the destination is on-link.
2569
2570                            WHY? DW.
2571                            Because we are allowed to send to an iface
2572                            even if it has NO routes and NO assigned
2573                            addresses. When oif is specified, the routing
2574                            tables are looked up with only one purpose:
2575                            to check whether the destination is gatewayed, rather
2576                            than direct. Moreover, if MSG_DONTROUTE is set,
2577                            we send the packet, ignoring both routing tables
2578                            and ifaddr state. --ANK
2579
2580
2581                            We could do this even if oif is unknown,
2582                            likely as IPv6 does, but we do not.
2583                          */
2584
2585                         if (fl4->saddr == 0)
2586                                 fl4->saddr = inet_select_addr(dev_out, 0,
2587                                                               RT_SCOPE_LINK);
2588                         res.type = RTN_UNICAST;
2589                         goto make_route;
2590                 }
2591                 rth = ERR_PTR(-ENETUNREACH);
2592                 goto out;
2593         }
2594
2595         if (res.type == RTN_LOCAL) {
2596                 if (!fl4->saddr) {
2597                         if (res.fi->fib_prefsrc)
2598                                 fl4->saddr = res.fi->fib_prefsrc;
2599                         else
2600                                 fl4->saddr = fl4->daddr;
2601                 }
2602                 dev_out = net->loopback_dev;
2603                 fl4->flowi4_oif = dev_out->ifindex;
2604                 res.fi = NULL;
2605                 flags |= RTCF_LOCAL;
2606                 goto make_route;
2607         }
2608
2609 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2610         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2611                 fib_select_multipath(&res);
2612         else
2613 #endif
2614         if (!res.prefixlen &&
2615             res.table->tb_num_default > 1 &&
2616             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2617                 fib_select_default(&res);
2618
2619         if (!fl4->saddr)
2620                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2621
2622         dev_out = FIB_RES_DEV(res);
2623         fl4->flowi4_oif = dev_out->ifindex;
2624
2625
2626 make_route:
2627         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2628                                dev_out, flags);
2629         if (!IS_ERR(rth)) {
2630                 unsigned int hash;
2631
2632                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2633                                rt_genid(dev_net(dev_out)));
2634                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2635         }
2636
2637 out:
2638         rcu_read_unlock();
2639         return rth;
2640 }
2641
2642 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2643 {
2644         struct rtable *rth;
2645         unsigned int hash;
2646
2647         if (!rt_caching(net))
2648                 goto slow_output;
2649
2650         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2651
2652         rcu_read_lock_bh();
2653         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2654                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2655                 if (rth->rt_key_dst == flp4->daddr &&
2656                     rth->rt_key_src == flp4->saddr &&
2657                     rt_is_output_route(rth) &&
2658                     rth->rt_oif == flp4->flowi4_oif &&
2659                     rth->rt_mark == flp4->flowi4_mark &&
2660                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2661                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2662                     net_eq(dev_net(rth->dst.dev), net) &&
2663                     !rt_is_expired(rth)) {
2664                         dst_use(&rth->dst, jiffies);
2665                         RT_CACHE_STAT_INC(out_hit);
2666                         rcu_read_unlock_bh();
2667                         if (!flp4->saddr)
2668                                 flp4->saddr = rth->rt_src;
2669                         if (!flp4->daddr)
2670                                 flp4->daddr = rth->rt_dst;
2671                         return rth;
2672                 }
2673                 RT_CACHE_STAT_INC(out_hlist_search);
2674         }
2675         rcu_read_unlock_bh();
2676
2677 slow_output:
2678         return ip_route_output_slow(net, flp4);
2679 }
2680 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2681
2682 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2683 {
2684         return NULL;
2685 }
2686
2687 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2688 {
2689         return 0;
2690 }
2691
2692 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2693 {
2694 }
2695
2696 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2697                                           unsigned long old)
2698 {
2699         return NULL;
2700 }
2701
2702 static struct dst_ops ipv4_dst_blackhole_ops = {
2703         .family                 =       AF_INET,
2704         .protocol               =       cpu_to_be16(ETH_P_IP),
2705         .destroy                =       ipv4_dst_destroy,
2706         .check                  =       ipv4_blackhole_dst_check,
2707         .default_mtu            =       ipv4_blackhole_default_mtu,
2708         .default_advmss         =       ipv4_default_advmss,
2709         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2710         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2711 };
2712
2713 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2714 {
2715         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2716         struct rtable *ort = (struct rtable *) dst_orig;
2717
2718         if (rt) {
2719                 struct dst_entry *new = &rt->dst;
2720
2721                 new->__use = 1;
2722                 new->input = dst_discard;
2723                 new->output = dst_discard;
2724                 dst_copy_metrics(new, &ort->dst);
2725
2726                 new->dev = ort->dst.dev;
2727                 if (new->dev)
2728                         dev_hold(new->dev);
2729
2730                 rt->rt_key_dst = ort->rt_key_dst;
2731                 rt->rt_key_src = ort->rt_key_src;
2732                 rt->rt_key_tos = ort->rt_key_tos;
2733                 rt->rt_route_iif = ort->rt_route_iif;
2734                 rt->rt_iif = ort->rt_iif;
2735                 rt->rt_oif = ort->rt_oif;
2736                 rt->rt_mark = ort->rt_mark;
2737
2738                 rt->rt_genid = rt_genid(net);
2739                 rt->rt_flags = ort->rt_flags;
2740                 rt->rt_type = ort->rt_type;
2741                 rt->rt_dst = ort->rt_dst;
2742                 rt->rt_src = ort->rt_src;
2743                 rt->rt_gateway = ort->rt_gateway;
2744                 rt->rt_spec_dst = ort->rt_spec_dst;
2745                 rt->peer = ort->peer;
2746                 if (rt->peer)
2747                         atomic_inc(&rt->peer->refcnt);
2748                 rt->fi = ort->fi;
2749                 if (rt->fi)
2750                         atomic_inc(&rt->fi->fib_clntref);
2751
2752                 dst_free(new);
2753         }
2754
2755         dst_release(dst_orig);
2756
2757         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2758 }
2759
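/* A minimal usage sketch (hypothetical caller; error handling trimmed):
 *
 *        struct flowi4 fl4 = {
 *                .daddr = daddr,
 *                .flowi4_tos = RT_TOS(tos),
 *                .flowi4_proto = IPPROTO_UDP,
 *        };
 *        struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *        if (IS_ERR(rt))
 *                return PTR_ERR(rt);
 *        skb_dst_set(skb, &rt->dst);
 *
 * When flowi4_proto is set, the returned dst may have been transformed by
 * xfrm_lookup().
 */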
2760 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2761                                     struct sock *sk)
2762 {
2763         struct rtable *rt = __ip_route_output_key(net, flp4);
2764
2765         if (IS_ERR(rt))
2766                 return rt;
2767
2768         if (flp4->flowi4_proto)
2769                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2770                                                    flowi4_to_flowi(flp4),
2771                                                    sk, 0);
2772
2773         return rt;
2774 }
2775 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2776
2777 static int rt_fill_info(struct net *net,
2778                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2779                         int nowait, unsigned int flags)
2780 {
2781         struct rtable *rt = skb_rtable(skb);
2782         struct rtmsg *r;
2783         struct nlmsghdr *nlh;
2784         long expires = 0;
2785         const struct inet_peer *peer = rt->peer;
2786         u32 id = 0, ts = 0, tsage = 0, error;
2787
2788         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2789         if (nlh == NULL)
2790                 return -EMSGSIZE;
2791
2792         r = nlmsg_data(nlh);
2793         r->rtm_family    = AF_INET;
2794         r->rtm_dst_len  = 32;
2795         r->rtm_src_len  = 0;
2796         r->rtm_tos      = rt->rt_key_tos;
2797         r->rtm_table    = RT_TABLE_MAIN;
2798         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2799         r->rtm_type     = rt->rt_type;
2800         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2801         r->rtm_protocol = RTPROT_UNSPEC;
2802         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2803         if (rt->rt_flags & RTCF_NOTIFY)
2804                 r->rtm_flags |= RTM_F_NOTIFY;
2805
2806         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2807
2808         if (rt->rt_key_src) {
2809                 r->rtm_src_len = 32;
2810                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2811         }
2812         if (rt->dst.dev)
2813                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2814 #ifdef CONFIG_IP_ROUTE_CLASSID
2815         if (rt->dst.tclassid)
2816                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2817 #endif
2818         if (rt_is_input_route(rt))
2819                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2820         else if (rt->rt_src != rt->rt_key_src)
2821                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2822
2823         if (rt->rt_dst != rt->rt_gateway)
2824                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2825
2826         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2827                 goto nla_put_failure;
2828
2829         if (rt->rt_mark)
2830                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2831
2832         error = rt->dst.error;
2833         if (peer) {
2834                 inet_peer_refcheck(rt->peer);
2835                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2836                 if (peer->tcp_ts_stamp) {
2837                         ts = peer->tcp_ts;
2838                         tsage = get_seconds() - peer->tcp_ts_stamp;
2839                 }
2840                 expires = ACCESS_ONCE(peer->pmtu_expires);
2841                 if (expires)
2842                         expires -= jiffies;
2843         }
2844
2845         if (rt_is_input_route(rt)) {
2846 #ifdef CONFIG_IP_MROUTE
2847                 __be32 dst = rt->rt_dst;
2848
2849                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2850                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2851                         int err = ipmr_get_route(net, skb,
2852                                                  rt->rt_src, rt->rt_dst,
2853                                                  r, nowait);
2854                         if (err <= 0) {
2855                                 if (!nowait) {
2856                                         if (err == 0)
2857                                                 return 0;
2858                                         goto nla_put_failure;
2859                                 } else {
2860                                         if (err == -EMSGSIZE)
2861                                                 goto nla_put_failure;
2862                                         error = err;
2863                                 }
2864                         }
2865                 } else
2866 #endif
2867                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2868         }
2869
2870         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2871                                expires, error) < 0)
2872                 goto nla_put_failure;
2873
2874         return nlmsg_end(skb, nlh);
2875
2876 nla_put_failure:
2877         nlmsg_cancel(skb, nlh);
2878         return -EMSGSIZE;
2879 }
2880
2881 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2882 {
2883         struct net *net = sock_net(in_skb->sk);
2884         struct rtmsg *rtm;
2885         struct nlattr *tb[RTA_MAX+1];
2886         struct rtable *rt = NULL;
2887         __be32 dst = 0;
2888         __be32 src = 0;
2889         u32 iif;
2890         int err;
2891         int mark;
2892         struct sk_buff *skb;
2893
2894         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2895         if (err < 0)
2896                 goto errout;
2897
2898         rtm = nlmsg_data(nlh);
2899
2900         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2901         if (skb == NULL) {
2902                 err = -ENOBUFS;
2903                 goto errout;
2904         }
2905
2906         /* Reserve room for dummy headers; this skb can pass
2907            through a good chunk of the routing engine.
2908          */
2909         skb_reset_mac_header(skb);
2910         skb_reset_network_header(skb);
2911
2912         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2913         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2914         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2915
2916         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2917         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2918         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2919         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2920
2921         if (iif) {
2922                 struct net_device *dev;
2923
2924                 dev = __dev_get_by_index(net, iif);
2925                 if (dev == NULL) {
2926                         err = -ENODEV;
2927                         goto errout_free;
2928                 }
2929
2930                 skb->protocol   = htons(ETH_P_IP);
2931                 skb->dev        = dev;
2932                 skb->mark       = mark;
2933                 local_bh_disable();
2934                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2935                 local_bh_enable();
2936
2937                 rt = skb_rtable(skb);
2938                 if (err == 0 && rt->dst.error)
2939                         err = -rt->dst.error;
2940         } else {
2941                 struct flowi4 fl4 = {
2942                         .daddr = dst,
2943                         .saddr = src,
2944                         .flowi4_tos = rtm->rtm_tos,
2945                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2946                         .flowi4_mark = mark,
2947                 };
2948                 rt = ip_route_output_key(net, &fl4);
2949
2950                 err = 0;
2951                 if (IS_ERR(rt))
2952                         err = PTR_ERR(rt);
2953         }
2954
2955         if (err)
2956                 goto errout_free;
2957
2958         skb_dst_set(skb, &rt->dst);
2959         if (rtm->rtm_flags & RTM_F_NOTIFY)
2960                 rt->rt_flags |= RTCF_NOTIFY;
2961
2962         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2963                            RTM_NEWROUTE, 0, 0);
2964         if (err <= 0)
2965                 goto errout_free;
2966
2967         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2968 errout:
2969         return err;
2970
2971 errout_free:
2972         kfree_skb(skb);
2973         goto errout;
2974 }
2975
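/*
 * Dump the route cache for netlink dump requests (e.g. "ip route show
 * cache").  The hash table is walked bucket by bucket under
 * rcu_read_lock_bh(); cb->args[0] and cb->args[1] record the bucket and
 * chain position so an interrupted dump can resume where it left off.
 */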
2976 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2977 {
2978         struct rtable *rt;
2979         int h, s_h;
2980         int idx, s_idx;
2981         struct net *net;
2982
2983         net = sock_net(skb->sk);
2984
2985         s_h = cb->args[0];
2986         if (s_h < 0)
2987                 s_h = 0;
2988         s_idx = idx = cb->args[1];
2989         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2990                 if (!rt_hash_table[h].chain)
2991                         continue;
2992                 rcu_read_lock_bh();
2993                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
2994                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
2995                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
2996                                 continue;
2997                         if (rt_is_expired(rt))
2998                                 continue;
2999                         skb_dst_set_noref(skb, &rt->dst);
3000                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3001                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3002                                          1, NLM_F_MULTI) <= 0) {
3003                                 skb_dst_drop(skb);
3004                                 rcu_read_unlock_bh();
3005                                 goto done;
3006                         }
3007                         skb_dst_drop(skb);
3008                 }
3009                 rcu_read_unlock_bh();
3010         }
3011
3012 done:
3013         cb->args[0] = h;
3014         cb->args[1] = idx;
3015         return skb->len;
3016 }
3017
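/* Invalidate this namespace's route cache when a device's multicast state changes. */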
3018 void ip_rt_multicast_event(struct in_device *in_dev)
3019 {
3020         rt_cache_flush(dev_net(in_dev->dev), 0);
3021 }
3022
3023 #ifdef CONFIG_SYSCTL
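/*
 * Write-only handler behind /proc/sys/net/ipv4/route/flush.  The value
 * written from userspace is parsed with proc_dointvec() and handed to
 * rt_cache_flush() as the flush delay, e.g.:
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * Reads are rejected with -EINVAL.
 */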
3024 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3025                                         void __user *buffer,
3026                                         size_t *lenp, loff_t *ppos)
3027 {
3028         if (write) {
3029                 int flush_delay;
3030                 ctl_table ctl;
3031                 struct net *net;
3032
3033                 memcpy(&ctl, __ctl, sizeof(ctl));
3034                 ctl.data = &flush_delay;
3035                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3036
3037                 net = (struct net *)__ctl->extra1;
3038                 rt_cache_flush(net, flush_delay);
3039                 return 0;
3040         }
3041
3042         return -EINVAL;
3043 }
3044
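/*
 * Global route-cache tunables exported under /proc/sys/net/ipv4/route/.
 * Entries using proc_dointvec_jiffies are read and written in seconds
 * (proc_dointvec_ms_jiffies in milliseconds) but stored in jiffies.
 */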
3045 static ctl_table ipv4_route_table[] = {
3046         {
3047                 .procname       = "gc_thresh",
3048                 .data           = &ipv4_dst_ops.gc_thresh,
3049                 .maxlen         = sizeof(int),
3050                 .mode           = 0644,
3051                 .proc_handler   = proc_dointvec,
3052         },
3053         {
3054                 .procname       = "max_size",
3055                 .data           = &ip_rt_max_size,
3056                 .maxlen         = sizeof(int),
3057                 .mode           = 0644,
3058                 .proc_handler   = proc_dointvec,
3059         },
3060         {
3061                 /*  Deprecated. Use gc_min_interval_ms */
3062
3063                 .procname       = "gc_min_interval",
3064                 .data           = &ip_rt_gc_min_interval,
3065                 .maxlen         = sizeof(int),
3066                 .mode           = 0644,
3067                 .proc_handler   = proc_dointvec_jiffies,
3068         },
3069         {
3070                 .procname       = "gc_min_interval_ms",
3071                 .data           = &ip_rt_gc_min_interval,
3072                 .maxlen         = sizeof(int),
3073                 .mode           = 0644,
3074                 .proc_handler   = proc_dointvec_ms_jiffies,
3075         },
3076         {
3077                 .procname       = "gc_timeout",
3078                 .data           = &ip_rt_gc_timeout,
3079                 .maxlen         = sizeof(int),
3080                 .mode           = 0644,
3081                 .proc_handler   = proc_dointvec_jiffies,
3082         },
3083         {
3084                 .procname       = "gc_interval",
3085                 .data           = &ip_rt_gc_interval,
3086                 .maxlen         = sizeof(int),
3087                 .mode           = 0644,
3088                 .proc_handler   = proc_dointvec_jiffies,
3089         },
3090         {
3091                 .procname       = "redirect_load",
3092                 .data           = &ip_rt_redirect_load,
3093                 .maxlen         = sizeof(int),
3094                 .mode           = 0644,
3095                 .proc_handler   = proc_dointvec,
3096         },
3097         {
3098                 .procname       = "redirect_number",
3099                 .data           = &ip_rt_redirect_number,
3100                 .maxlen         = sizeof(int),
3101                 .mode           = 0644,
3102                 .proc_handler   = proc_dointvec,
3103         },
3104         {
3105                 .procname       = "redirect_silence",
3106                 .data           = &ip_rt_redirect_silence,
3107                 .maxlen         = sizeof(int),
3108                 .mode           = 0644,
3109                 .proc_handler   = proc_dointvec,
3110         },
3111         {
3112                 .procname       = "error_cost",
3113                 .data           = &ip_rt_error_cost,
3114                 .maxlen         = sizeof(int),
3115                 .mode           = 0644,
3116                 .proc_handler   = proc_dointvec,
3117         },
3118         {
3119                 .procname       = "error_burst",
3120                 .data           = &ip_rt_error_burst,
3121                 .maxlen         = sizeof(int),
3122                 .mode           = 0644,
3123                 .proc_handler   = proc_dointvec,
3124         },
3125         {
3126                 .procname       = "gc_elasticity",
3127                 .data           = &ip_rt_gc_elasticity,
3128                 .maxlen         = sizeof(int),
3129                 .mode           = 0644,
3130                 .proc_handler   = proc_dointvec,
3131         },
3132         {
3133                 .procname       = "mtu_expires",
3134                 .data           = &ip_rt_mtu_expires,
3135                 .maxlen         = sizeof(int),
3136                 .mode           = 0644,
3137                 .proc_handler   = proc_dointvec_jiffies,
3138         },
3139         {
3140                 .procname       = "min_pmtu",
3141                 .data           = &ip_rt_min_pmtu,
3142                 .maxlen         = sizeof(int),
3143                 .mode           = 0644,
3144                 .proc_handler   = proc_dointvec,
3145         },
3146         {
3147                 .procname       = "min_adv_mss",
3148                 .data           = &ip_rt_min_advmss,
3149                 .maxlen         = sizeof(int),
3150                 .mode           = 0644,
3151                 .proc_handler   = proc_dointvec,
3152         },
3153         { }
3154 };
3155
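/*
 * Skeleton registered early from ip_static_sysctl_init() (see the
 * init-order comment at the bottom of this file) so that the
 * net.ipv4.route and net.ipv4.neigh sysctl directories are in place
 * before the rest of the ipv4 stack and the per-net tables come up.
 */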
3156 static struct ctl_table empty[1];
3157
3158 static struct ctl_table ipv4_skeleton[] =
3159 {
3160         { .procname = "route", 
3161           .mode = 0555, .child = ipv4_route_table},
3162         { .procname = "neigh", 
3163           .mode = 0555, .child = empty},
3164         { }
3165 };
3166
3167 static __net_initdata struct ctl_path ipv4_path[] = {
3168         { .procname = "net", },
3169         { .procname = "ipv4", },
3170         { },
3171 };
3172
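/*
 * Only the "flush" entry is per network namespace; sysctl_route_net_init()
 * below stashes the owning struct net in extra1 so the handler knows which
 * namespace's cache to flush.
 */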
3173 static struct ctl_table ipv4_route_flush_table[] = {
3174         {
3175                 .procname       = "flush",
3176                 .maxlen         = sizeof(int),
3177                 .mode           = 0200,
3178                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3179         },
3180         { },
3181 };
3182
3183 static __net_initdata struct ctl_path ipv4_route_path[] = {
3184         { .procname = "net", },
3185         { .procname = "ipv4", },
3186         { .procname = "route", },
3187         { },
3188 };
3189
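/*
 * Per-namespace sysctl setup: init_net uses the static flush table
 * directly, every other namespace gets its own kmemdup()ed copy, and the
 * exit handler frees that copy again (the BUG_ON guards against freeing
 * the static table).
 */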
3190 static __net_init int sysctl_route_net_init(struct net *net)
3191 {
3192         struct ctl_table *tbl;
3193
3194         tbl = ipv4_route_flush_table;
3195         if (!net_eq(net, &init_net)) {
3196                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3197                 if (tbl == NULL)
3198                         goto err_dup;
3199         }
3200         tbl[0].extra1 = net;
3201
3202         net->ipv4.route_hdr =
3203                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3204         if (net->ipv4.route_hdr == NULL)
3205                 goto err_reg;
3206         return 0;
3207
3208 err_reg:
3209         if (tbl != ipv4_route_flush_table)
3210                 kfree(tbl);
3211 err_dup:
3212         return -ENOMEM;
3213 }
3214
3215 static __net_exit void sysctl_route_net_exit(struct net *net)
3216 {
3217         struct ctl_table *tbl;
3218
3219         tbl = net->ipv4.route_hdr->ctl_table_arg;
3220         unregister_net_sysctl_table(net->ipv4.route_hdr);
3221         BUG_ON(tbl == ipv4_route_flush_table);
3222         kfree(tbl);
3223 }
3224
3225 static __net_initdata struct pernet_operations sysctl_route_ops = {
3226         .init = sysctl_route_net_init,
3227         .exit = sysctl_route_net_exit,
3228 };
3229 #endif
3230
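/*
 * Seed each namespace's route-cache generation id (and the device address
 * generation id) with random values.  Bumping rt_genid elsewhere in this
 * file (rt_cache_invalidate()) is what makes rt_is_expired() treat every
 * cached entry as stale.
 */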
3231 static __net_init int rt_genid_init(struct net *net)
3232 {
3233         get_random_bytes(&net->ipv4.rt_genid,
3234                          sizeof(net->ipv4.rt_genid));
3235         get_random_bytes(&net->ipv4.dev_addr_genid,
3236                          sizeof(net->ipv4.dev_addr_genid));
3237         return 0;
3238 }
3239
3240 static __net_initdata struct pernet_operations rt_genid_ops = {
3241         .init = rt_genid_init,
3242 };
3243
3244
3245 #ifdef CONFIG_IP_ROUTE_CLASSID
3246 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3247 #endif /* CONFIG_IP_ROUTE_CLASSID */
3248
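/*
 * "rhash_entries=" on the kernel command line overrides the automatic
 * sizing of the route-cache hash table done in ip_rt_init() below,
 * e.g. booting with rhash_entries=262144.
 */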
3249 static __initdata unsigned long rhash_entries;
3250 static int __init set_rhash_entries(char *str)
3251 {
3252         if (!str)
3253                 return 0;
3254         rhash_entries = simple_strtoul(str, &str, 0);
3255         return 1;
3256 }
3257 __setup("rhash_entries=", set_rhash_entries);
3258
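/*
 * Boot-time initialisation of the IPv4 routing layer: allocate the dst
 * slab cache and the route-cache hash table, derive gc_thresh and
 * ip_rt_max_size from the table size, bring up devinet/fib/procfs/xfrm,
 * and register the RTM_GETROUTE handler plus the per-net sysctl and
 * genid operations.
 */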
3259 int __init ip_rt_init(void)
3260 {
3261         int rc = 0;
3262
3263 #ifdef CONFIG_IP_ROUTE_CLASSID
3264         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3265         if (!ip_rt_acct)
3266                 panic("IP: failed to allocate ip_rt_acct\n");
3267 #endif
3268
3269         ipv4_dst_ops.kmem_cachep =
3270                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3271                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3272
3273         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3274
3275         if (dst_entries_init(&ipv4_dst_ops) < 0)
3276                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3277
3278         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3279                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3280
3281         rt_hash_table = (struct rt_hash_bucket *)
3282                 alloc_large_system_hash("IP route cache",
3283                                         sizeof(struct rt_hash_bucket),
3284                                         rhash_entries,
3285                                         (totalram_pages >= 128 * 1024) ?
3286                                         15 : 17,
3287                                         0,
3288                                         &rt_hash_log,
3289                                         &rt_hash_mask,
3290                                         rhash_entries ? 0 : 512 * 1024);
3291         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3292         rt_hash_lock_init();
3293
3294         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3295         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3296
3297         devinet_init();
3298         ip_fib_init();
3299
3300         if (ip_rt_proc_init())
3301                 printk(KERN_ERR "Unable to create route proc files\n");
3302 #ifdef CONFIG_XFRM
3303         xfrm_init();
3304         xfrm4_init(ip_rt_max_size);
3305 #endif
3306         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3307
3308 #ifdef CONFIG_SYSCTL
3309         register_pernet_subsys(&sysctl_route_ops);
3310 #endif
3311         register_pernet_subsys(&rt_genid_ops);
3312         return rc;
3313 }
3314
3315 #ifdef CONFIG_SYSCTL
3316 /*
3317  * We really need to sanitize the damn ipv4 init order; once we do,
3318  * all this nonsense will go away.
3319  */
3320 void __init ip_static_sysctl_init(void)
3321 {
3322         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3323 }
3324 #endif