/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
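
/* Illustrative sketch (kept under #if 0, not built): how this table is
 * meant to be indexed.  rt_tos2priority() in <net/route.h> performs the
 * same computation; the "example_" name below is ours, not kernel API.
 */
#if 0
static u8 example_tos2prio(u8 tos)
{
        /* IPTOS_TOS() keeps the four TOS bits (mask 0x1E); shifting
         * right by one yields an index 0..15 into ip_tos2prio[],
         * e.g. IPTOS_LOWDELAY (0x10) -> index 8 -> TC_PRIO_INTERACTIVE.
         */
        return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}
#endif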

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", 0444, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", 0444,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}
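
/* Bumping the namespace's IPv4 genid invalidates every cached route in
 * one step: each rtable records the genid current at creation time,
 * rt_is_expired() above compares it against the live value, and stale
 * entries then fail ipv4_dst_check() and are looked up afresh.
 */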

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        struct neighbour *n;

        rcu_read_lock_bh();

        if (likely(rt->rt_gw_family == AF_INET)) {
                n = ip_neigh_gw4(dev, rt->rt_gw4);
        } else if (rt->rt_gw_family == AF_INET6) {
                n = ip_neigh_gw6(dev, &rt->rt_gw6);
        } else {
                __be32 pkey;

                pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
                n = ip_neigh_gw4(dev, pkey);
        }

        if (n && !refcount_inc_not_zero(&n->refcnt))
                n = NULL;

        rcu_read_unlock_bh();

        return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;

        if (rt->rt_gw_family == AF_INET) {
                pkey = (const __be32 *)&rt->rt_gw4;
        } else if (rt->rt_gw_family == AF_INET6) {
                return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
        } else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
                return;
        }
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = READ_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* Note the following code is racy (the key may be written while
         * another CPU is hashing with it), but that is okay: the worst
         * case is a few IDs generated from a transient key.
         */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
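
/* Illustrative sketch (kept under #if 0, not built): a transmit path
 * reserving identifiers for a GSO skb, in the spirit of
 * ip_select_ident_segs() in <net/ip.h>.  Each software segment consumes
 * one ID, so the whole range is reserved in a single atomic step.  The
 * "example_" name is ours, not kernel API.
 */
#if 0
static void example_assign_ip_id(struct net *net, struct sk_buff *skb)
{
        struct iphdr *iph = ip_hdr(skb);
        int segs = skb_shinfo(skb)->gso_segs ?: 1;

        __ip_select_ident(net, iph, segs);
}
#endif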

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}
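
/* hash_32() above folds the jhash into one of FNHE_HASH_SIZE
 * (1 << FNHE_HASH_SHIFT) per-nexthop buckets; update_or_create_fnhe()
 * and find_exception() below index the bucket array with the result.
 */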

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gw_family = AF_INET;
                rt->rt_gw4 = fnhe->fnhe_gw;
        }
}

static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                                  __be32 gw, u32 pmtu, bool lock,
                                  unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nhc->nhc_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nhc->nhc_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nhc->nhc_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                /* Exception created; mark the nexthop's cached routes
                 * stale, so anyone using them rechecks whether this
                 * exception applies to them.
                 */
                rt = rcu_dereference(nhc->nhc_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}
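
/* Both exception producers funnel through update_or_create_fnhe():
 * __ip_do_redirect() passes a new gateway with pmtu == 0, while
 * __ip_rt_update_pmtu() passes an mtu with gw == 0, so the update
 * branch above only touches the fields the caller actually supplied.
 */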

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh_common *nhc = FIB_RES_NHC(res);

                                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them entirely,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (it breaks PMTU discovery) in icmp.c.
 */
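
/* Worked example with the defaults above: ip_rt_redirect_load is HZ/50
 * (20ms), so the k-th redirect to a peer becomes eligible once
 *      jiffies > rate_last + (HZ/50 << k),
 * i.e. the gaps grow as 40ms, 80ms, 160ms, ...  After
 * ip_rt_redirect_number (9) unanswered redirects we go silent until
 * ip_rt_redirect_silence (~20s) has elapsed since the last event, at
 * which point the counters reset.
 */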

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the time of the last seen
         * redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
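
/* The peer rate limit above is a token bucket counted in jiffies:
 * tokens accrue one per jiffy up to ip_rt_error_burst (5*HZ) and each
 * ICMP error costs ip_rt_error_cost (HZ), i.e. a sustained rate of
 * about one error per second per peer, with bursts of up to five.
 */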

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        u32 old_mtu = ipv4_mtu(dst);
        struct fib_result res;
        bool lock = false;

        if (ip_mtu_locked(dst))
                return;

        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc = FIB_RES_NHC(res);

                update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        u32 mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
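
/* Illustrative sketch (kept under #if 0, not built): how an ICMP error
 * handler might feed a FRAG_NEEDED quench into PMTU state via the
 * helper above.  Assumes skb->data points at the IPv4 header of the
 * packet that triggered the error, as ipv4_update_pmtu() itself does;
 * the "example_" name is ours.  info is the next-hop MTU the caller
 * extracted from the ICMP header.
 */
#if 0
static void example_frag_needed(struct sk_buff *skb, struct net *net,
                                u32 info)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;

        /* oif 0 lets the route lookup pick the interface */
        ipv4_update_pmtu(skb, net, info, 0, iph->protocol);
}
#endif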

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD.
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}
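
/* Illustrative sketch (kept under #if 0, not built) of the caller
 * contract: anything holding a cached dst revalidates it with
 * dst_check() (which invokes the ->check() op above; IPv4 uses
 * cookie 0) before trusting it.  Assumes the caller may hold the
 * socket lock; the "example_" name is ours.
 */
#if 0
static void example_revalidate(struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_get(sk);

        if (dst && !dst_check(dst, 0))
                sk_dst_reset(sk);       /* stale: force a fresh lookup */
}
#endif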

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct iphdr *iph = ip_hdr(skb);
                struct flowi4 fl4 = {
                        .daddr = iph->daddr,
                        .saddr = iph->saddr,
                        .flowi4_tos = RT_TOS(iph->tos),
                        .flowi4_oif = rt->dst.dev->ifindex,
                        .flowi4_iif = skb->dev->ifindex,
                        .flowi4_mark = skb->mark,
                };

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}
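
/* Worked example: with a 1500 byte path MTU, advmss is
 * 1500 - 40 = 1460 bytes.  The max_t() keeps it from falling below
 * ip_rt_min_advmss (256) for tiny MTUs, and the min() caps it at
 * IPV4_MAX_PMTU minus headers for oversized ones.
 */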

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_gw_family && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nhc->nhc_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
                                               __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr) {
                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires)) {
                                ip_del_fnhe(nhc, daddr);
                                break;
                        }
                        return fnhe;
                }
        }
        return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
        struct fib_nh_common *nhc = res->nhc;
        struct net_device *dev = nhc->nhc_dev;
        struct fib_info *fi = res->fi;
        u32 mtu = 0;

        if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
                mtu = fi->fib_mtu;

        if (likely(!mtu)) {
                struct fib_nh_exception *fnhe;

                fnhe = find_exception(nhc, daddr);
                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
                        mtu = fnhe->fnhe_pmtu;
        }

        if (likely(!mtu))
                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

        return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}
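
/* Illustrative sketch (kept under #if 0, not built): resolving the
 * effective path MTU after a FIB lookup, in the order documented
 * above.  fib_lookup() requires the RCU read lock; the "example_"
 * name is ours.
 */
#if 0
static u32 example_path_mtu(struct net *net, struct flowi4 *fl4)
{
        struct fib_result res;
        u32 mtu = 0;

        rcu_read_lock();
        if (fib_lookup(net, fl4, &res, 0) == 0)
                mtu = ip_mtu_from_fib_result(&res, fl4->daddr);
        rcu_read_unlock();

        return mtu;     /* 0 means no route was found */
}
#endif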

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe->fnhe_mtu_locked = false;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gw4) {
                        rt->rt_gw4 = daddr;
                        rt->rt_gw_family = AF_INET;
                }

                if (do_cache) {
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nhc->nhc_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
        }
        orig = *p;

        /* hold dst before doing cmpxchg() to avoid race condition
         * on this dst
         */
        dst_hold(&rt->dst);
        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig) {
                        dst_dev_put(&orig->dst);
                        dst_release(&orig->dst);
                }
        } else {
                dst_release(&rt->dst);
                ret = false;
        }

        return ret;
}
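
/* The cmpxchg() above only publishes rt if the slot still holds the
 * entry we sampled; a concurrent writer that won the race keeps its
 * route and we drop our extra reference.  Illustrative reader side
 * (kept under #if 0, not built), using rt_cache_valid() defined below;
 * the "example_" name is ours.
 */
#if 0
static void example_use_cached_input_route(struct fib_nh_common *nhc,
                                           struct sk_buff *skb)
{
        struct rtable *cached;

        rcu_read_lock();
        cached = rcu_dereference(nhc->nhc_rth_input);
        if (rt_cache_valid(cached))
                skb_dst_set_noref(skb, &cached->dst);
        rcu_read_unlock();
}
#endif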
1499
1500 struct uncached_list {
1501         spinlock_t              lock;
1502         struct list_head        head;
1503 };
1504
1505 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1506
1507 void rt_add_uncached_list(struct rtable *rt)
1508 {
1509         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1510
1511         rt->rt_uncached_list = ul;
1512
1513         spin_lock_bh(&ul->lock);
1514         list_add_tail(&rt->rt_uncached, &ul->head);
1515         spin_unlock_bh(&ul->lock);
1516 }
1517
1518 void rt_del_uncached_list(struct rtable *rt)
1519 {
1520         if (!list_empty(&rt->rt_uncached)) {
1521                 struct uncached_list *ul = rt->rt_uncached_list;
1522
1523                 spin_lock_bh(&ul->lock);
1524                 list_del(&rt->rt_uncached);
1525                 spin_unlock_bh(&ul->lock);
1526         }
1527 }
1528
1529 static void ipv4_dst_destroy(struct dst_entry *dst)
1530 {
1531         struct rtable *rt = (struct rtable *)dst;
1532
1533         ip_dst_metrics_put(dst);
1534         rt_del_uncached_list(rt);
1535 }
1536
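     /* A device is going away: walk every CPU's uncached list and point
      * the routes that still reference it at the loopback device, moving
      * the device reference along (dev_hold() the new device, dev_put()
      * the old) so the dying device's refcount can drop to zero.
      */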
1537 void rt_flush_dev(struct net_device *dev)
1538 {
1539         struct net *net = dev_net(dev);
1540         struct rtable *rt;
1541         int cpu;
1542
1543         for_each_possible_cpu(cpu) {
1544                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1545
1546                 spin_lock_bh(&ul->lock);
1547                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1548                         if (rt->dst.dev != dev)
1549                                 continue;
1550                         rt->dst.dev = net->loopback_dev;
1551                         dev_hold(rt->dst.dev);
1552                         dev_put(dev);
1553                 }
1554                 spin_unlock_bh(&ul->lock);
1555         }
1556 }
1557
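     /* A cached route may be reused only while its dst is still of the
      * DST_OBSOLETE_FORCE_CHK kind and its generation id has not expired
      * (see rt_is_expired()).
      */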
1558 static bool rt_cache_valid(const struct rtable *rt)
1559 {
1560         return  rt &&
1561                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1562                 !rt_is_expired(rt);
1563 }
1564
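     /* Fill in the nexthop-derived fields of @rt (gateway, metrics,
      * classid, lwtstate) and try to cache it: bind it to the nexthop
      * exception if one was found, otherwise store it in the nexthop
      * cache when @do_cache is set.  If neither sticks, the route is
      * tracked on the uncached list instead.
      */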
1565 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1566                            const struct fib_result *res,
1567                            struct fib_nh_exception *fnhe,
1568                            struct fib_info *fi, u16 type, u32 itag,
1569                            const bool do_cache)
1570 {
1571         bool cached = false;
1572
1573         if (fi) {
1574                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1575
1576                 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1577                         rt->rt_gw_family = nhc->nhc_gw_family;
1578                         /* only INET and INET6 are supported */
1579                         if (likely(nhc->nhc_gw_family == AF_INET))
1580                                 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1581                         else
1582                                 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1583                 }
1584
1585                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1586
1587 #ifdef CONFIG_IP_ROUTE_CLASSID
1588                 {
1589                         struct fib_nh *nh;
1590
1591                         nh = container_of(nhc, struct fib_nh, nh_common);
1592                         rt->dst.tclassid = nh->nh_tclassid;
1593                 }
1594 #endif
1595                 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1596                 if (unlikely(fnhe))
1597                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1598                 else if (do_cache)
1599                         cached = rt_cache_route(nhc, rt);
1600                 if (unlikely(!cached)) {
1601                         /* Routes we intend to cache in the nexthop
1602                          * exception or FIB nexthop are normally not put
1603                          * on the uncached list; if we fail to store this
1604                          * route in the cache, we must track it there.
1605                          */
1606                         if (!rt->rt_gw4) {
1607                                 rt->rt_gw_family = AF_INET;
1608                                 rt->rt_gw4 = daddr;
1609                         }
1610                         rt_add_uncached_list(rt);
1611                 }
1612         } else
1613                 rt_add_uncached_list(rt);
1614
1615 #ifdef CONFIG_IP_ROUTE_CLASSID
1616 #ifdef CONFIG_IP_MULTIPLE_TABLES
1617         set_class_tag(rt, res->tclassid);
1618 #endif
1619         set_class_tag(rt, itag);
1620 #endif
1621 }
1622
1623 struct rtable *rt_dst_alloc(struct net_device *dev,
1624                             unsigned int flags, u16 type,
1625                             bool nopolicy, bool noxfrm, bool will_cache)
1626 {
1627         struct rtable *rt;
1628
1629         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1630                        (will_cache ? 0 : DST_HOST) |
1631                        (nopolicy ? DST_NOPOLICY : 0) |
1632                        (noxfrm ? DST_NOXFRM : 0));
1633
1634         if (rt) {
1635                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1636                 rt->rt_flags = flags;
1637                 rt->rt_type = type;
1638                 rt->rt_is_input = 0;
1639                 rt->rt_iif = 0;
1640                 rt->rt_pmtu = 0;
1641                 rt->rt_mtu_locked = 0;
1642                 rt->rt_gw_family = 0;
1643                 rt->rt_gw4 = 0;
1644                 INIT_LIST_HEAD(&rt->rt_uncached);
1645
1646                 rt->dst.output = ip_output;
1647                 if (flags & RTCF_LOCAL)
1648                         rt->dst.input = ip_local_deliver;
1649         }
1650
1651         return rt;
1652 }
1653 EXPORT_SYMBOL(rt_dst_alloc);
1654
1655 /* called in rcu_read_lock() section */
1656 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1657                           u8 tos, struct net_device *dev,
1658                           struct in_device *in_dev, u32 *itag)
1659 {
1660         int err;
1661
1662         /* Primary sanity checks. */
1663         if (!in_dev)
1664                 return -EINVAL;
1665
1666         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1667             skb->protocol != htons(ETH_P_IP))
1668                 return -EINVAL;
1669
1670         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1671                 return -EINVAL;
1672
1673         if (ipv4_is_zeronet(saddr)) {
1674                 if (!ipv4_is_local_multicast(daddr) &&
1675                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1676                         return -EINVAL;
1677         } else {
1678                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1679                                           in_dev, itag);
1680                 if (err < 0)
1681                         return err;
1682         }
1683         return 0;
1684 }
1685
1686 /* called in rcu_read_lock() section */
1687 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1688                              u8 tos, struct net_device *dev, int our)
1689 {
1690         struct in_device *in_dev = __in_dev_get_rcu(dev);
1691         unsigned int flags = RTCF_MULTICAST;
1692         struct rtable *rth;
1693         u32 itag = 0;
1694         int err;
1695
1696         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1697         if (err)
1698                 return err;
1699
1700         if (our)
1701                 flags |= RTCF_LOCAL;
1702
1703         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1704                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1705         if (!rth)
1706                 return -ENOBUFS;
1707
1708 #ifdef CONFIG_IP_ROUTE_CLASSID
1709         rth->dst.tclassid = itag;
1710 #endif
1711         rth->dst.output = ip_rt_bug;
1712         rth->rt_is_input = 1;
1713
1714 #ifdef CONFIG_IP_MROUTE
1715         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1716                 rth->dst.input = ip_mr_input;
1717 #endif
1718         RT_CACHE_STAT_INC(in_slow_mc);
1719
1720         skb_dst_set(skb, &rth->dst);
1721         return 0;
1722 }
1723
1724
1725 static void ip_handle_martian_source(struct net_device *dev,
1726                                      struct in_device *in_dev,
1727                                      struct sk_buff *skb,
1728                                      __be32 daddr,
1729                                      __be32 saddr)
1730 {
1731         RT_CACHE_STAT_INC(in_martian_src);
1732 #ifdef CONFIG_IP_ROUTE_VERBOSE
1733         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1734                 /*
1735                  *      RFC 1812 recommendation: if the source is martian,
1736                  *      the only hint we can log is the MAC header.
1737                  */
1738                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1739                         &daddr, &saddr, dev->name);
1740                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1741                         print_hex_dump(KERN_WARNING, "ll header: ",
1742                                        DUMP_PREFIX_OFFSET, 16, 1,
1743                                        skb_mac_header(skb),
1744                                        dev->hard_header_len, false);
1745                 }
1746         }
1747 #endif
1748 }
1749
1750 /* called in rcu_read_lock() section */
1751 static int __mkroute_input(struct sk_buff *skb,
1752                            const struct fib_result *res,
1753                            struct in_device *in_dev,
1754                            __be32 daddr, __be32 saddr, u32 tos)
1755 {
1756         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1757         struct net_device *dev = nhc->nhc_dev;
1758         struct fib_nh_exception *fnhe;
1759         struct rtable *rth;
1760         int err;
1761         struct in_device *out_dev;
1762         bool do_cache;
1763         u32 itag = 0;
1764
1765         /* get a working reference to the output device */
1766         out_dev = __in_dev_get_rcu(dev);
1767         if (!out_dev) {
1768                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1769                 return -EINVAL;
1770         }
1771
1772         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1773                                   in_dev->dev, in_dev, &itag);
1774         if (err < 0) {
1775                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1776                                          saddr);
1777
1778                 goto cleanup;
1779         }
1780
1781         do_cache = res->fi && !itag;
1782         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1783             skb->protocol == htons(ETH_P_IP)) {
1784                 __be32 gw;
1785
1786                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1787                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1788                     inet_addr_onlink(out_dev, saddr, gw))
1789                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1790         }
1791
1792         if (skb->protocol != htons(ETH_P_IP)) {
1793                 /* Not IP (i.e. ARP). Do not create a route if it is
1794                  * invalid for proxy ARP. DNAT routes are always valid.
1795                  *
1796                  * The proxy ARP feature has been extended to allow ARP
1797                  * replies back out the same interface, to support
1798                  * private VLAN switch technologies. See arp.c.
1799                  */
1800                 if (out_dev == in_dev &&
1801                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1802                         err = -EINVAL;
1803                         goto cleanup;
1804                 }
1805         }
1806
1807         fnhe = find_exception(nhc, daddr);
1808         if (do_cache) {
1809                 if (fnhe)
1810                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1811                 else
1812                         rth = rcu_dereference(nhc->nhc_rth_input);
1813                 if (rt_cache_valid(rth)) {
1814                         skb_dst_set_noref(skb, &rth->dst);
1815                         goto out;
1816                 }
1817         }
1818
1819         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1820                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1821                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1822         if (!rth) {
1823                 err = -ENOBUFS;
1824                 goto cleanup;
1825         }
1826
1827         rth->rt_is_input = 1;
1828         RT_CACHE_STAT_INC(in_slow_tot);
1829
1830         rth->dst.input = ip_forward;
1831
1832         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1833                        do_cache);
1834         lwtunnel_set_redirect(&rth->dst);
1835         skb_dst_set(skb, &rth->dst);
1836 out:
1837         err = 0;
1838  cleanup:
1839         return err;
1840 }
1841
1842 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1843 /* To make ICMP packets follow the right flow, the multipath hash is
1844  * calculated from the inner IP addresses.
1845  */
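     /* Illustrative example: when a router sends ICMP_DEST_UNREACH for a
      * flow A -> B, the error packet carries the original IP header as
      * payload.  Hashing on that inner (A, B) pair makes the ICMP error
      * take the same multipath leg as the flow it refers to, instead of
      * being spread by the outer (router -> A) addresses.
      */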
1846 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1847                                  struct flow_keys *hash_keys)
1848 {
1849         const struct iphdr *outer_iph = ip_hdr(skb);
1850         const struct iphdr *key_iph = outer_iph;
1851         const struct iphdr *inner_iph;
1852         const struct icmphdr *icmph;
1853         struct iphdr _inner_iph;
1854         struct icmphdr _icmph;
1855
1856         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1857                 goto out;
1858
1859         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1860                 goto out;
1861
1862         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1863                                    &_icmph);
1864         if (!icmph)
1865                 goto out;
1866
1867         if (icmph->type != ICMP_DEST_UNREACH &&
1868             icmph->type != ICMP_REDIRECT &&
1869             icmph->type != ICMP_TIME_EXCEEDED &&
1870             icmph->type != ICMP_PARAMETERPROB)
1871                 goto out;
1872
1873         inner_iph = skb_header_pointer(skb,
1874                                        outer_iph->ihl * 4 + sizeof(_icmph),
1875                                        sizeof(_inner_iph), &_inner_iph);
1876         if (!inner_iph)
1877                 goto out;
1878
1879         key_iph = inner_iph;
1880 out:
1881         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1882         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1883 }
1884
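     /* Policy 0 hashes on L3 addresses only (using the inner ICMP payload
      * addresses where applicable); policy 1 hashes the L4 five-tuple,
      * dissecting the skb if flow keys were not already provided.  Either
      * way the result is folded to 31 bits (mhash >> 1).
      */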
1885 /* If skb is set it will be used and fl4 can be NULL. */
1886 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1887                        const struct sk_buff *skb, struct flow_keys *flkeys)
1888 {
1889         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1890         struct flow_keys hash_keys;
1891         u32 mhash;
1892
1893         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1894         case 0:
1895                 memset(&hash_keys, 0, sizeof(hash_keys));
1896                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1897                 if (skb) {
1898                         ip_multipath_l3_keys(skb, &hash_keys);
1899                 } else {
1900                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1901                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1902                 }
1903                 break;
1904         case 1:
1905                 /* skb is currently provided only when forwarding */
1906                 if (skb) {
1907                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1908                         struct flow_keys keys;
1909
1910                         /* short-circuit if we already have L4 hash present */
1911                         if (skb->l4_hash)
1912                                 return skb_get_hash_raw(skb) >> 1;
1913
1914                         memset(&hash_keys, 0, sizeof(hash_keys));
1915
1916                         if (!flkeys) {
1917                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1918                                 flkeys = &keys;
1919                         }
1920
1921                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1922                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1923                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1924                         hash_keys.ports.src = flkeys->ports.src;
1925                         hash_keys.ports.dst = flkeys->ports.dst;
1926                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1927                 } else {
1928                         memset(&hash_keys, 0, sizeof(hash_keys));
1929                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1930                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1931                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1932                         hash_keys.ports.src = fl4->fl4_sport;
1933                         hash_keys.ports.dst = fl4->fl4_dport;
1934                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1935                 }
1936                 break;
1937         }
1938         mhash = flow_hash_from_keys(&hash_keys);
1939
1940         if (multipath_hash)
1941                 mhash = jhash_2words(mhash, multipath_hash, 0);
1942
1943         return mhash >> 1;
1944 }
1945 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1946
1947 static int ip_mkroute_input(struct sk_buff *skb,
1948                             struct fib_result *res,
1949                             struct in_device *in_dev,
1950                             __be32 daddr, __be32 saddr, u32 tos,
1951                             struct flow_keys *hkeys)
1952 {
1953 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1954         if (res->fi && fib_info_num_path(res->fi) > 1) {
1955                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1956
1957                 fib_select_multipath(res, h);
1958         }
1959 #endif
1960
1961         /* create a routing cache entry */
1962         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1963 }
1964
1965 /*
1966  *      NOTE. We drop all packets that have a local source
1967  *      address, because every properly looped-back packet must
1968  *      already have the correct destination attached by the output routine.
1969  *
1970  *      This approach solves two big problems:
1971  *      1. Non-simplex devices are handled properly.
1972  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1973  *      Called with rcu_read_lock().
1974  */
1975
1976 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1977                                u8 tos, struct net_device *dev,
1978                                struct fib_result *res)
1979 {
1980         struct in_device *in_dev = __in_dev_get_rcu(dev);
1981         struct flow_keys *flkeys = NULL, _flkeys;
1982         struct net    *net = dev_net(dev);
1983         struct ip_tunnel_info *tun_info;
1984         int             err = -EINVAL;
1985         unsigned int    flags = 0;
1986         u32             itag = 0;
1987         struct rtable   *rth;
1988         struct flowi4   fl4;
1989         bool do_cache;
1990
1991         /* IP on this device is disabled. */
1992
1993         if (!in_dev)
1994                 goto out;
1995
1996         /* Check for the weirdest martians, which cannot be detected
1997            by fib_lookup.
1998          */
1999
2000         tun_info = skb_tunnel_info(skb);
2001         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2002                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2003         else
2004                 fl4.flowi4_tun_key.tun_id = 0;
2005         skb_dst_drop(skb);
2006
2007         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2008                 goto martian_source;
2009
2010         res->fi = NULL;
2011         res->table = NULL;
2012         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2013                 goto brd_input;
2014
2015         /* Accept zero addresses only for limited broadcast;
2016          * I am not even sure whether to fix this or not. Waiting for complaints :-)
2017          */
2018         if (ipv4_is_zeronet(saddr))
2019                 goto martian_source;
2020
2021         if (ipv4_is_zeronet(daddr))
2022                 goto martian_destination;
2023
2024         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2025          * invoking it at most once when daddr and/or saddr is a loopback address.
2026          */
2027         if (ipv4_is_loopback(daddr)) {
2028                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2029                         goto martian_destination;
2030         } else if (ipv4_is_loopback(saddr)) {
2031                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2032                         goto martian_source;
2033         }
2034
2035         /*
2036          *      Now we are ready to route the packet.
2037          */
2038         fl4.flowi4_oif = 0;
2039         fl4.flowi4_iif = dev->ifindex;
2040         fl4.flowi4_mark = skb->mark;
2041         fl4.flowi4_tos = tos;
2042         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2043         fl4.flowi4_flags = 0;
2044         fl4.daddr = daddr;
2045         fl4.saddr = saddr;
2046         fl4.flowi4_uid = sock_net_uid(net, NULL);
2047
2048         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2049                 flkeys = &_flkeys;
2050         } else {
2051                 fl4.flowi4_proto = 0;
2052                 fl4.fl4_sport = 0;
2053                 fl4.fl4_dport = 0;
2054         }
2055
2056         err = fib_lookup(net, &fl4, res, 0);
2057         if (err != 0) {
2058                 if (!IN_DEV_FORWARD(in_dev))
2059                         err = -EHOSTUNREACH;
2060                 goto no_route;
2061         }
2062
2063         if (res->type == RTN_BROADCAST) {
2064                 if (IN_DEV_BFORWARD(in_dev))
2065                         goto make_route;
2066                 goto brd_input;
2067         }
2068
2069         if (res->type == RTN_LOCAL) {
2070                 err = fib_validate_source(skb, saddr, daddr, tos,
2071                                           0, dev, in_dev, &itag);
2072                 if (err < 0)
2073                         goto martian_source;
2074                 goto local_input;
2075         }
2076
2077         if (!IN_DEV_FORWARD(in_dev)) {
2078                 err = -EHOSTUNREACH;
2079                 goto no_route;
2080         }
2081         if (res->type != RTN_UNICAST)
2082                 goto martian_destination;
2083
2084 make_route:
2085         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2086 out:    return err;
2087
2088 brd_input:
2089         if (skb->protocol != htons(ETH_P_IP))
2090                 goto e_inval;
2091
2092         if (!ipv4_is_zeronet(saddr)) {
2093                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2094                                           in_dev, &itag);
2095                 if (err < 0)
2096                         goto martian_source;
2097         }
2098         flags |= RTCF_BROADCAST;
2099         res->type = RTN_BROADCAST;
2100         RT_CACHE_STAT_INC(in_brd);
2101
2102 local_input:
2103         do_cache = false;
2104         if (res->fi) {
2105                 if (!itag) {
2106                         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2107
2108                         rth = rcu_dereference(nhc->nhc_rth_input);
2109                         if (rt_cache_valid(rth)) {
2110                                 skb_dst_set_noref(skb, &rth->dst);
2111                                 err = 0;
2112                                 goto out;
2113                         }
2114                         do_cache = true;
2115                 }
2116         }
2117
2118         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2119                            flags | RTCF_LOCAL, res->type,
2120                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2121         if (!rth)
2122                 goto e_nobufs;
2123
2124         rth->dst.output = ip_rt_bug;
2125 #ifdef CONFIG_IP_ROUTE_CLASSID
2126         rth->dst.tclassid = itag;
2127 #endif
2128         rth->rt_is_input = 1;
2129
2130         RT_CACHE_STAT_INC(in_slow_tot);
2131         if (res->type == RTN_UNREACHABLE) {
2132                 rth->dst.input = ip_error;
2133                 rth->dst.error = -err;
2134                 rth->rt_flags   &= ~RTCF_LOCAL;
2135         }
2136
2137         if (do_cache) {
2138                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2139
2140                 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2141                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2142                         WARN_ON(rth->dst.input == lwtunnel_input);
2143                         rth->dst.lwtstate->orig_input = rth->dst.input;
2144                         rth->dst.input = lwtunnel_input;
2145                 }
2146
2147                 if (unlikely(!rt_cache_route(nhc, rth)))
2148                         rt_add_uncached_list(rth);
2149         }
2150         skb_dst_set(skb, &rth->dst);
2151         err = 0;
2152         goto out;
2153
2154 no_route:
2155         RT_CACHE_STAT_INC(in_no_route);
2156         res->type = RTN_UNREACHABLE;
2157         res->fi = NULL;
2158         res->table = NULL;
2159         goto local_input;
2160
2161         /*
2162          *      Do not cache martian addresses: they should be logged (RFC1812)
2163          */
2164 martian_destination:
2165         RT_CACHE_STAT_INC(in_martian_dst);
2166 #ifdef CONFIG_IP_ROUTE_VERBOSE
2167         if (IN_DEV_LOG_MARTIANS(in_dev))
2168                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2169                                      &daddr, &saddr, dev->name);
2170 #endif
2171
2172 e_inval:
2173         err = -EINVAL;
2174         goto out;
2175
2176 e_nobufs:
2177         err = -ENOBUFS;
2178         goto out;
2179
2180 martian_source:
2181         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2182         goto out;
2183 }
2184
2185 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2186                          u8 tos, struct net_device *dev)
2187 {
2188         struct fib_result res;
2189         int err;
2190
2191         tos &= IPTOS_RT_MASK;
2192         rcu_read_lock();
2193         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2194         rcu_read_unlock();
2195
2196         return err;
2197 }
2198 EXPORT_SYMBOL(ip_route_input_noref);
2199
2200 /* called with rcu_read_lock held */
2201 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2202                        u8 tos, struct net_device *dev, struct fib_result *res)
2203 {
2204         /* Multicast recognition logic was moved from the route cache to
2205            here.  The problem was that too many Ethernet cards have
2206            broken/missing hardware multicast filters :-( As a result, a host
2207            on a multicast network acquires a lot of useless route cache
2208            entries, e.g. for SDR messages from all over the world.  Now we
2209            try to get rid of them.  Really, provided the software IP
2210            multicast filter is organized reasonably (at least, hashed), it
2211            does not result in a slowdown compared with route cache reject
2212            entries.  Note that multicast routers are not affected, because
2213            a route cache entry is created eventually.
2214          */
2215         if (ipv4_is_multicast(daddr)) {
2216                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2217                 int our = 0;
2218                 int err = -EINVAL;
2219
2220                 if (!in_dev)
2221                         return err;
2222                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2223                                       ip_hdr(skb)->protocol);
2224
2225                 /* check l3 master if no match yet */
2226                 if (!our && netif_is_l3_slave(dev)) {
2227                         struct in_device *l3_in_dev;
2228
2229                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2230                         if (l3_in_dev)
2231                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2232                                                       ip_hdr(skb)->protocol);
2233                 }
2234
2235                 if (our
2236 #ifdef CONFIG_IP_MROUTE
2237                         ||
2238                     (!ipv4_is_local_multicast(daddr) &&
2239                      IN_DEV_MFORWARD(in_dev))
2240 #endif
2241                    ) {
2242                         err = ip_route_input_mc(skb, daddr, saddr,
2243                                                 tos, dev, our);
2244                 }
2245                 return err;
2246         }
2247
2248         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2249 }
2250
2251 /* called with rcu_read_lock() */
2252 static struct rtable *__mkroute_output(const struct fib_result *res,
2253                                        const struct flowi4 *fl4, int orig_oif,
2254                                        struct net_device *dev_out,
2255                                        unsigned int flags)
2256 {
2257         struct fib_info *fi = res->fi;
2258         struct fib_nh_exception *fnhe;
2259         struct in_device *in_dev;
2260         u16 type = res->type;
2261         struct rtable *rth;
2262         bool do_cache;
2263
2264         in_dev = __in_dev_get_rcu(dev_out);
2265         if (!in_dev)
2266                 return ERR_PTR(-EINVAL);
2267
2268         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2269                 if (ipv4_is_loopback(fl4->saddr) &&
2270                     !(dev_out->flags & IFF_LOOPBACK) &&
2271                     !netif_is_l3_master(dev_out))
2272                         return ERR_PTR(-EINVAL);
2273
2274         if (ipv4_is_lbcast(fl4->daddr))
2275                 type = RTN_BROADCAST;
2276         else if (ipv4_is_multicast(fl4->daddr))
2277                 type = RTN_MULTICAST;
2278         else if (ipv4_is_zeronet(fl4->daddr))
2279                 return ERR_PTR(-EINVAL);
2280
2281         if (dev_out->flags & IFF_LOOPBACK)
2282                 flags |= RTCF_LOCAL;
2283
2284         do_cache = true;
2285         if (type == RTN_BROADCAST) {
2286                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2287                 fi = NULL;
2288         } else if (type == RTN_MULTICAST) {
2289                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2290                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2291                                      fl4->flowi4_proto))
2292                         flags &= ~RTCF_LOCAL;
2293                 else
2294                         do_cache = false;
2295                 /* If a multicast route does not exist, use the
2296                  * default one, but do not route via a gateway in
2297                  * this case. Yes, it is a hack.
2298                  */
2299                 if (fi && res->prefixlen < 4)
2300                         fi = NULL;
2301         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2302                    (orig_oif != dev_out->ifindex)) {
2303                 /* For local routes that require a particular output interface
2304                  * we do not want to cache the result.  Caching the result
2305                  * causes incorrect behaviour when there are multiple source
2306                  * addresses on the interface, the end result being that if the
2307                  * intended recipient is waiting on that interface for the
2308                  * packet he won't receive it because it will be delivered on
2309                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2310                  * be set to the loopback interface as well.
2311                  */
2312                 do_cache = false;
2313         }
2314
2315         fnhe = NULL;
2316         do_cache &= fi != NULL;
2317         if (fi) {
2318                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2319                 struct rtable __rcu **prth;
2320
2321                 fnhe = find_exception(nhc, fl4->daddr);
2322                 if (!do_cache)
2323                         goto add;
2324                 if (fnhe) {
2325                         prth = &fnhe->fnhe_rth_output;
2326                 } else {
2327                         if (unlikely(fl4->flowi4_flags &
2328                                      FLOWI_FLAG_KNOWN_NH &&
2329                                      !(nhc->nhc_gw_family &&
2330                                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2331                                 do_cache = false;
2332                                 goto add;
2333                         }
2334                         prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2335                 }
2336                 rth = rcu_dereference(*prth);
2337                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2338                         return rth;
2339         }
2340
2341 add:
2342         rth = rt_dst_alloc(dev_out, flags, type,
2343                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2344                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2345                            do_cache);
2346         if (!rth)
2347                 return ERR_PTR(-ENOBUFS);
2348
2349         rth->rt_iif = orig_oif;
2350
2351         RT_CACHE_STAT_INC(out_slow_tot);
2352
2353         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2354                 if (flags & RTCF_LOCAL &&
2355                     !(dev_out->flags & IFF_LOOPBACK)) {
2356                         rth->dst.output = ip_mc_output;
2357                         RT_CACHE_STAT_INC(out_slow_mc);
2358                 }
2359 #ifdef CONFIG_IP_MROUTE
2360                 if (type == RTN_MULTICAST) {
2361                         if (IN_DEV_MFORWARD(in_dev) &&
2362                             !ipv4_is_local_multicast(fl4->daddr)) {
2363                                 rth->dst.input = ip_mr_input;
2364                                 rth->dst.output = ip_mc_output;
2365                         }
2366                 }
2367 #endif
2368         }
2369
2370         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2371         lwtunnel_set_redirect(&rth->dst);
2372
2373         return rth;
2374 }
2375
2376 /*
2377  * Major route resolver routine.
2378  */
2379
2380 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2381                                         const struct sk_buff *skb)
2382 {
2383         __u8 tos = RT_FL_TOS(fl4);
2384         struct fib_result res = {
2385                 .type           = RTN_UNSPEC,
2386                 .fi             = NULL,
2387                 .table          = NULL,
2388                 .tclassid       = 0,
2389         };
2390         struct rtable *rth;
2391
2392         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2393         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2394         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2395                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2396
2397         rcu_read_lock();
2398         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2399         rcu_read_unlock();
2400
2401         return rth;
2402 }
2403 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2404
2405 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2406                                             struct fib_result *res,
2407                                             const struct sk_buff *skb)
2408 {
2409         struct net_device *dev_out = NULL;
2410         int orig_oif = fl4->flowi4_oif;
2411         unsigned int flags = 0;
2412         struct rtable *rth;
2413         int err = -ENETUNREACH;
2414
2415         if (fl4->saddr) {
2416                 rth = ERR_PTR(-EINVAL);
2417                 if (ipv4_is_multicast(fl4->saddr) ||
2418                     ipv4_is_lbcast(fl4->saddr) ||
2419                     ipv4_is_zeronet(fl4->saddr))
2420                         goto out;
2421
2422                 /* I removed the check for oif == dev_out->oif here.
2423                    It was wrong for two reasons:
2424                    1. ip_dev_find(net, saddr) can return the wrong iface
2425                       if saddr is assigned to multiple interfaces.
2426                    2. Moreover, we are allowed to send packets with the
2427                       saddr of another iface. --ANK
2428                  */
2429
2430                 if (fl4->flowi4_oif == 0 &&
2431                     (ipv4_is_multicast(fl4->daddr) ||
2432                      ipv4_is_lbcast(fl4->daddr))) {
2433                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2434                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2435                         if (!dev_out)
2436                                 goto out;
2437
2438                         /* Special hack: the user can direct multicasts
2439                            and limited broadcast via the necessary interface
2440                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2441                            This hack is not just for fun; it allows
2442                            vic, vat and friends to work.
2443                            They bind a socket to loopback, set the ttl to zero
2444                            and expect that it will work.
2445                            From the viewpoint of the routing cache they are broken,
2446                            because we are not allowed to build a multicast path
2447                            with a loopback source addr (the routing cache
2448                            cannot know that the ttl is zero, so the packet
2449                            will not leave this host and the route is valid).
2450                            Luckily, this hack is a good workaround.
2451                          */
2452
2453                         fl4->flowi4_oif = dev_out->ifindex;
2454                         goto make_route;
2455                 }
2456
2457                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2458                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2459                         if (!__ip_dev_find(net, fl4->saddr, false))
2460                                 goto out;
2461                 }
2462         }
2463
2464
2465         if (fl4->flowi4_oif) {
2466                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2467                 rth = ERR_PTR(-ENODEV);
2468                 if (!dev_out)
2469                         goto out;
2470
2471                 /* RACE: Check return value of inet_select_addr instead. */
2472                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2473                         rth = ERR_PTR(-ENETUNREACH);
2474                         goto out;
2475                 }
2476                 if (ipv4_is_local_multicast(fl4->daddr) ||
2477                     ipv4_is_lbcast(fl4->daddr) ||
2478                     fl4->flowi4_proto == IPPROTO_IGMP) {
2479                         if (!fl4->saddr)
2480                                 fl4->saddr = inet_select_addr(dev_out, 0,
2481                                                               RT_SCOPE_LINK);
2482                         goto make_route;
2483                 }
2484                 if (!fl4->saddr) {
2485                         if (ipv4_is_multicast(fl4->daddr))
2486                                 fl4->saddr = inet_select_addr(dev_out, 0,
2487                                                               fl4->flowi4_scope);
2488                         else if (!fl4->daddr)
2489                                 fl4->saddr = inet_select_addr(dev_out, 0,
2490                                                               RT_SCOPE_HOST);
2491                 }
2492         }
2493
2494         if (!fl4->daddr) {
2495                 fl4->daddr = fl4->saddr;
2496                 if (!fl4->daddr)
2497                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2498                 dev_out = net->loopback_dev;
2499                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2500                 res->type = RTN_LOCAL;
2501                 flags |= RTCF_LOCAL;
2502                 goto make_route;
2503         }
2504
2505         err = fib_lookup(net, fl4, res, 0);
2506         if (err) {
2507                 res->fi = NULL;
2508                 res->table = NULL;
2509                 if (fl4->flowi4_oif &&
2510                     (ipv4_is_multicast(fl4->daddr) ||
2511                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2512                         /* Apparently, the routing tables are wrong.
2513                            Assume that the destination is on-link.
2514
2515                            WHY? DW.
2516                            Because we are allowed to send to an iface
2517                            even if it has NO routes and NO assigned
2518                            addresses. When oif is specified, the routing
2519                            tables are looked up with only one purpose:
2520                            to catch whether the destination is gatewayed,
2521                            rather than direct. Moreover, if MSG_DONTROUTE
2522                            is set, we send the packet, ignoring both the
2523                            routing tables and the ifaddr state. --ANK
2524
2525
2526                            We could do this even if oif is unknown,
2527                            as IPv6 likely does, but we do not.
2528                          */
2529
2530                         if (fl4->saddr == 0)
2531                                 fl4->saddr = inet_select_addr(dev_out, 0,
2532                                                               RT_SCOPE_LINK);
2533                         res->type = RTN_UNICAST;
2534                         goto make_route;
2535                 }
2536                 rth = ERR_PTR(err);
2537                 goto out;
2538         }
2539
2540         if (res->type == RTN_LOCAL) {
2541                 if (!fl4->saddr) {
2542                         if (res->fi->fib_prefsrc)
2543                                 fl4->saddr = res->fi->fib_prefsrc;
2544                         else
2545                                 fl4->saddr = fl4->daddr;
2546                 }
2547
2548                 /* L3 master device is the loopback for that domain */
2549                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2550                         net->loopback_dev;
2551
2552                 /* make sure orig_oif points to fib result device even
2553                  * though packet rx/tx happens over loopback or l3mdev
2554                  */
2555                 orig_oif = FIB_RES_OIF(*res);
2556
2557                 fl4->flowi4_oif = dev_out->ifindex;
2558                 flags |= RTCF_LOCAL;
2559                 goto make_route;
2560         }
2561
2562         fib_select_path(net, res, fl4, skb);
2563
2564         dev_out = FIB_RES_DEV(*res);
2565         fl4->flowi4_oif = dev_out->ifindex;
2566
2567
2568 make_route:
2569         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2570
2571 out:
2572         return rth;
2573 }
2574
2575 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2576 {
2577         return NULL;
2578 }
2579
2580 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2581 {
2582         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2583
2584         return mtu ? : dst->dev->mtu;
2585 }
2586
2587 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2588                                           struct sk_buff *skb, u32 mtu)
2589 {
2590 }
2591
2592 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2593                                        struct sk_buff *skb)
2594 {
2595 }
2596
2597 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2598                                           unsigned long old)
2599 {
2600         return NULL;
2601 }
2602
2603 static struct dst_ops ipv4_dst_blackhole_ops = {
2604         .family                 =       AF_INET,
2605         .check                  =       ipv4_blackhole_dst_check,
2606         .mtu                    =       ipv4_blackhole_mtu,
2607         .default_advmss         =       ipv4_default_advmss,
2608         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2609         .redirect               =       ipv4_rt_blackhole_redirect,
2610         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2611         .neigh_lookup           =       ipv4_neigh_lookup,
2612 };
2613
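     /* A blackhole route silently discards everything sent through it
      * while still looking like a usable dst: input/output are the
      * dst_discard helpers, PMTU and redirect updates are no-ops, and
      * ->check always fails so the entry is never revalidated.  It is
      * used, e.g., by the xfrm layer when packets must be dropped while
      * security associations are still being resolved.
      */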
2614 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2615 {
2616         struct rtable *ort = (struct rtable *) dst_orig;
2617         struct rtable *rt;
2618
2619         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2620         if (rt) {
2621                 struct dst_entry *new = &rt->dst;
2622
2623                 new->__use = 1;
2624                 new->input = dst_discard;
2625                 new->output = dst_discard_out;
2626
2627                 new->dev = net->loopback_dev;
2628                 if (new->dev)
2629                         dev_hold(new->dev);
2630
2631                 rt->rt_is_input = ort->rt_is_input;
2632                 rt->rt_iif = ort->rt_iif;
2633                 rt->rt_pmtu = ort->rt_pmtu;
2634                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2635
2636                 rt->rt_genid = rt_genid_ipv4(net);
2637                 rt->rt_flags = ort->rt_flags;
2638                 rt->rt_type = ort->rt_type;
2639                 rt->rt_gw_family = ort->rt_gw_family;
2640                 if (rt->rt_gw_family == AF_INET)
2641                         rt->rt_gw4 = ort->rt_gw4;
2642                 else if (rt->rt_gw_family == AF_INET6)
2643                         rt->rt_gw6 = ort->rt_gw6;
2644
2645                 INIT_LIST_HEAD(&rt->rt_uncached);
2646         }
2647
2648         dst_release(dst_orig);
2649
2650         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2651 }
2652
2653 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2654                                     const struct sock *sk)
2655 {
2656         struct rtable *rt = __ip_route_output_key(net, flp4);
2657
2658         if (IS_ERR(rt))
2659                 return rt;
2660
2661         if (flp4->flowi4_proto)
2662                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2663                                                         flowi4_to_flowi(flp4),
2664                                                         sk, 0);
2665
2666         return rt;
2667 }
2668 EXPORT_SYMBOL_GPL(ip_route_output_flow);
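
     /* Typical caller-side usage (an illustrative sketch, not code from
      * this file): fill in a flow key, resolve it, and release the route
      * when done:
      *
      *     struct flowi4 fl4 = {
      *             .daddr        = daddr,
      *             .flowi4_proto = IPPROTO_UDP,
      *     };
      *     struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
      *
      *     if (IS_ERR(rt))
      *             return PTR_ERR(rt);
      *     ...
      *     ip_rt_put(rt);
      */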
2669
2670 /* called with rcu_read_lock held */
2671 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2672                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2673                         struct sk_buff *skb, u32 portid, u32 seq)
2674 {
2675         struct rtmsg *r;
2676         struct nlmsghdr *nlh;
2677         unsigned long expires = 0;
2678         u32 error;
2679         u32 metrics[RTAX_MAX];
2680
2681         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2682         if (!nlh)
2683                 return -EMSGSIZE;
2684
2685         r = nlmsg_data(nlh);
2686         r->rtm_family    = AF_INET;
2687         r->rtm_dst_len  = 32;
2688         r->rtm_src_len  = 0;
2689         r->rtm_tos      = fl4->flowi4_tos;
2690         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2691         if (nla_put_u32(skb, RTA_TABLE, table_id))
2692                 goto nla_put_failure;
2693         r->rtm_type     = rt->rt_type;
2694         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2695         r->rtm_protocol = RTPROT_UNSPEC;
2696         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2697         if (rt->rt_flags & RTCF_NOTIFY)
2698                 r->rtm_flags |= RTM_F_NOTIFY;
2699         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2700                 r->rtm_flags |= RTCF_DOREDIRECT;
2701
2702         if (nla_put_in_addr(skb, RTA_DST, dst))
2703                 goto nla_put_failure;
2704         if (src) {
2705                 r->rtm_src_len = 32;
2706                 if (nla_put_in_addr(skb, RTA_SRC, src))
2707                         goto nla_put_failure;
2708         }
2709         if (rt->dst.dev &&
2710             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2711                 goto nla_put_failure;
2712 #ifdef CONFIG_IP_ROUTE_CLASSID
2713         if (rt->dst.tclassid &&
2714             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2715                 goto nla_put_failure;
2716 #endif
2717         if (!rt_is_input_route(rt) &&
2718             fl4->saddr != src) {
2719                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2720                         goto nla_put_failure;
2721         }
2722         if (rt->rt_gw_family == AF_INET &&
2723             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2724                 goto nla_put_failure;
2725         } else if (rt->rt_gw_family == AF_INET6) {
2726                 int alen = sizeof(struct in6_addr);
2727                 struct nlattr *nla;
2728                 struct rtvia *via;
2729
2730                 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2731                 if (!nla)
2732                         goto nla_put_failure;
2733
2734                 via = nla_data(nla);
2735                 via->rtvia_family = AF_INET6;
2736                 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2737         }
2738
2739         expires = rt->dst.expires;
2740         if (expires) {
2741                 unsigned long now = jiffies;
2742
2743                 if (time_before(now, expires))
2744                         expires -= now;
2745                 else
2746                         expires = 0;
2747         }
2748
2749         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2750         if (rt->rt_pmtu && expires)
2751                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2752         if (rt->rt_mtu_locked && expires)
2753                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2754         if (rtnetlink_put_metrics(skb, metrics) < 0)
2755                 goto nla_put_failure;
2756
2757         if (fl4->flowi4_mark &&
2758             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2759                 goto nla_put_failure;
2760
2761         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2762             nla_put_u32(skb, RTA_UID,
2763                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2764                 goto nla_put_failure;
2765
2766         error = rt->dst.error;
2767
2768         if (rt_is_input_route(rt)) {
2769 #ifdef CONFIG_IP_MROUTE
2770                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2771                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2772                         int err = ipmr_get_route(net, skb,
2773                                                  fl4->saddr, fl4->daddr,
2774                                                  r, portid);
2775
2776                         if (err <= 0) {
2777                                 if (err == 0)
2778                                         return 0;
2779                                 goto nla_put_failure;
2780                         }
2781                 } else
2782 #endif
2783                         if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2784                                 goto nla_put_failure;
2785         }
2786
2787         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2788                 goto nla_put_failure;
2789
2790         nlmsg_end(skb, nlh);
2791         return 0;
2792
2793 nla_put_failure:
2794         nlmsg_cancel(skb, nlh);
2795         return -EMSGSIZE;
2796 }
2797
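     /* Build a minimal dummy packet for an RTM_GETROUTE request: an IPv4
      * header plus, depending on ip_proto, a zeroed UDP, TCP or ICMP
      * header, so that fib rules and the multipath hash can dissect the
      * same fields a real packet would carry.
      */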
2798 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2799                                                    u8 ip_proto, __be16 sport,
2800                                                    __be16 dport)
2801 {
2802         struct sk_buff *skb;
2803         struct iphdr *iph;
2804
2805         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2806         if (!skb)
2807                 return NULL;
2808
2809         /* Reserve room for dummy headers; this skb can pass
2810          * through a good chunk of the routing engine.
2811          */
2812         skb_reset_mac_header(skb);
2813         skb_reset_network_header(skb);
2814         skb->protocol = htons(ETH_P_IP);
2815         iph = skb_put(skb, sizeof(struct iphdr));
2816         iph->protocol = ip_proto;
2817         iph->saddr = src;
2818         iph->daddr = dst;
2819         iph->version = 0x4;
2820         iph->frag_off = 0;
2821         iph->ihl = 0x5;
2822         skb_set_transport_header(skb, skb->len);
2823
2824         switch (iph->protocol) {
2825         case IPPROTO_UDP: {
2826                 struct udphdr *udph;
2827
2828                 udph = skb_put_zero(skb, sizeof(struct udphdr));
2829                 udph->source = sport;
2830                 udph->dest = dport;
2831                 udph->len = sizeof(struct udphdr);
2832                 udph->check = 0;
2833                 break;
2834         }
2835         case IPPROTO_TCP: {
2836                 struct tcphdr *tcph;
2837
2838                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2839                 tcph->source    = sport;
2840                 tcph->dest      = dport;
2841                 tcph->doff      = sizeof(struct tcphdr) / 4;
2842                 tcph->rst = 1;
2843                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2844                                             src, dst, 0);
2845                 break;
2846         }
2847         case IPPROTO_ICMP: {
2848                 struct icmphdr *icmph;
2849
2850                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2851                 icmph->type = ICMP_ECHO;
2852                 icmph->code = 0;
2853         }
2854         }
2855
2856         return skb;
2857 }
2858
static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
				       const struct nlmsghdr *nlh,
				       struct nlattr **tb,
				       struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG(extack,
			       "ipv4: Invalid header for route get request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv4_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
	    rtm->rtm_table || rtm->rtm_protocol ||
	    rtm->rtm_scope || rtm->rtm_type) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
		return -EINVAL;
	}

	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
			       RTM_F_LOOKUP_TABLE |
			       RTM_F_FIB_MATCH)) {
		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv4_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_IIF:
		case RTA_OIF:
		case RTA_SRC:
		case RTA_DST:
		case RTA_IP_PROTO:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_MARK:
		case RTA_UID:
			break;
		default:
			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
			return -EINVAL;
		}
	}

	return 0;
}

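/* RTM_GETROUTE handler: resolve one route on behalf of userspace.
 * This is the kernel side of "ip route get", e.g. (illustrative
 * invocation, not taken from this file):
 *
 *	ip route get 192.0.2.1 iif eth0 from 198.51.100.2 mark 7
 *
 * A minimal dummy packet is built first so that input-path lookups see
 * the same L4 keys a real packet would carry; the same skb is then
 * trimmed and reused for the netlink reply.
 */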
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev	= dev;
		skb->mark	= mark;
		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
					 dev, &res);

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
				    rt->rt_type, res.prefix, res.prefixlen,
				    fl4.flowi4_tos, res.fi, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;
errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}

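/* Any multicast device configuration change may invalidate cached
 * routes, so just flush the whole per-namespace cache.
 */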
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;

static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}

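/* These knobs are exported under /proc/sys/net/ipv4/route/.  Entries
 * using proc_dointvec_jiffies are converted by the handler, so
 * userspace reads and writes seconds (milliseconds for the _ms
 * variant), e.g. (illustrative):
 *
 *	sysctl -w net.ipv4.route.mtu_expires=600
 */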
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/*  Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

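/* For every namespace other than init_net the flush table is
 * duplicated so that its ->extra1 can point at the owning struct net;
 * the static template itself is registered only for init_net.
 */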
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

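/* Per-namespace generation counters.  Bumping rt_genid (via
 * rt_cache_flush()) makes every cached dst fail its validity check and
 * get re-resolved lazily; this is what the "flush" sysctl above relies
 * on.
 */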
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

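/* Per-namespace inetpeer base: long-lived per-destination state
 * (e.g. ICMP rate limiting) kept outside the dst entries themselves.
 */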
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

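/* Boot-time initialisation of the IPv4 routing layer.  Allocation
 * failures here are fatal: without the identity arrays, dst caches and
 * the FIB there is no way to route at all, hence the panic() calls.
 */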
int __init ip_rt_init(void)
{
	int cpu;

	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
				  GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif