net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/inetdevice.h>
  84 #include <linux/igmp.h>
  85 #include <linux/pkt_sched.h>
  86 #include <linux/mroute.h>
  87 #include <linux/netfilter_ipv4.h>
  88 #include <linux/random.h>
  89 #include <linux/rcupdate.h>
  90 #include <linux/times.h>
  91 #include <linux/slab.h>
  92 #include <net/dst.h>
  93 #include <net/net_namespace.h>
  94 #include <net/protocol.h>
  95 #include <net/ip.h>
  96 #include <net/route.h>
  97 #include <net/inetpeer.h>
  98 #include <net/sock.h>
  99 #include <net/ip_fib.h>
 100 #include <net/arp.h>
 101 #include <net/tcp.h>
 102 #include <net/icmp.h>
 103 #include <net/xfrm.h>
 104 #include <net/netevent.h>
 105 #include <net/rtnetlink.h>
 106 #ifdef CONFIG_SYSCTL
 107 #include <linux/sysctl.h>
 108 #include <linux/kmemleak.h>
 109 #endif
 110 #include <net/secure_seq.h>
 111
 112 #define RT_FL_TOS(oldflp4) \
 113         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 114
 115 #define IP_MAX_MTU      0xFFF0
 116
 117 #define RT_GC_TIMEOUT (300*HZ)
 118
 119 static int ip_rt_max_size;
 120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 121 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 123 static int ip_rt_redirect_number __read_mostly  = 9;
 124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 126 static int ip_rt_error_cost __read_mostly       = HZ;
 127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 128 static int ip_rt_gc_elasticity __read_mostly    = 8;
 129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 131 static int ip_rt_min_advmss __read_mostly       = 256;
 132
 133 /*
 134  *      Interface to generic destination cache.
 135  */
 136
 137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 138 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 139 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 141 static void              ipv4_link_failure(struct sk_buff *skb);
 142 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 143                                            struct sk_buff *skb, u32 mtu);
 144 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 145                                         struct sk_buff *skb);
 146 static void             ipv4_dst_destroy(struct dst_entry *dst);
 147
 148 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 149                             int how)
 150 {
 151 }
 152
 153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 154 {
 155         WARN_ON(1);
 156         return NULL;
 157 }
 158
 159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 160                                            struct sk_buff *skb,
 161                                            const void *daddr);
 162
 163 static struct dst_ops ipv4_dst_ops = {
 164         .family =               AF_INET,
 165         .protocol =             cpu_to_be16(ETH_P_IP),
 166         .check =                ipv4_dst_check,
 167         .default_advmss =       ipv4_default_advmss,
 168         .mtu =                  ipv4_mtu,
 169         .cow_metrics =          ipv4_cow_metrics,
 170         .destroy =              ipv4_dst_destroy,
 171         .ifdown =               ipv4_dst_ifdown,
 172         .negative_advice =      ipv4_negative_advice,
 173         .link_failure =         ipv4_link_failure,
 174         .update_pmtu =          ip_rt_update_pmtu,
 175         .redirect =             ip_do_redirect,
 176         .local_out =            __ip_local_out,
 177         .neigh_lookup =         ipv4_neigh_lookup,
 178 };
 179
 180 #define ECN_OR_COST(class)      TC_PRIO_##class
 181
 182 const __u8 ip_tos2prio[16] = {
 183         TC_PRIO_BESTEFFORT,
 184         ECN_OR_COST(BESTEFFORT),
 185         TC_PRIO_BESTEFFORT,
 186         ECN_OR_COST(BESTEFFORT),
 187         TC_PRIO_BULK,
 188         ECN_OR_COST(BULK),
 189         TC_PRIO_BULK,
 190         ECN_OR_COST(BULK),
 191         TC_PRIO_INTERACTIVE,
 192         ECN_OR_COST(INTERACTIVE),
 193         TC_PRIO_INTERACTIVE,
 194         ECN_OR_COST(INTERACTIVE),
 195         TC_PRIO_INTERACTIVE_BULK,
 196         ECN_OR_COST(INTERACTIVE_BULK),
 197         TC_PRIO_INTERACTIVE_BULK,
 198         ECN_OR_COST(INTERACTIVE_BULK)
 199 };
 200 EXPORT_SYMBOL(ip_tos2prio);
 201
 202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 203 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 204
 205 static inline int rt_genid(struct net *net)
 206 {
 207         return atomic_read(&net->ipv4.rt_genid);
 208 }
 209
 210 #ifdef CONFIG_PROC_FS
 211 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 212 {
 213         if (*pos)
 214                 return NULL;
 215         return SEQ_START_TOKEN;
 216 }
 217
 218 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 219 {
 220         ++*pos;
 221         return NULL;
 222 }
 223
 224 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 225 {
 226 }
 227
 228 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 229 {
 230         if (v == SEQ_START_TOKEN)
 231                 seq_printf(seq, "%-127s\n",
 232                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 233                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 234                            "HHUptod\tSpecDst");
 235         return 0;
 236 }
 237
 238 static const struct seq_operations rt_cache_seq_ops = {
 239         .start  = rt_cache_seq_start,
 240         .next   = rt_cache_seq_next,
 241         .stop   = rt_cache_seq_stop,
 242         .show   = rt_cache_seq_show,
 243 };
 244
 245 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 246 {
 247         return seq_open(file, &rt_cache_seq_ops);
 248 }
 249
 250 static const struct file_operations rt_cache_seq_fops = {
 251         .owner   = THIS_MODULE,
 252         .open    = rt_cache_seq_open,
 253         .read    = seq_read,
 254         .llseek  = seq_lseek,
 255         .release = seq_release,
 256 };
 257
 258
 259 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 260 {
 261         int cpu;
 262
 263         if (*pos == 0)
 264                 return SEQ_START_TOKEN;
 265
 266         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 267                 if (!cpu_possible(cpu))
 268                         continue;
 269                 *pos = cpu+1;
 270                 return &per_cpu(rt_cache_stat, cpu);
 271         }
 272         return NULL;
 273 }
 274
 275 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 276 {
 277         int cpu;
 278
 279         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 280                 if (!cpu_possible(cpu))
 281                         continue;
 282                 *pos = cpu+1;
 283                 return &per_cpu(rt_cache_stat, cpu);
 284         }
 285         return NULL;
 286
 287 }
 288
 289 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 290 {
 291
 292 }
 293
 294 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 295 {
 296         struct rt_cache_stat *st = v;
 297
 298         if (v == SEQ_START_TOKEN) {
 299                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 300                 return 0;
 301         }
 302
 303         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 304                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 305                    dst_entries_get_slow(&ipv4_dst_ops),
 306                    st->in_hit,
 307                    st->in_slow_tot,
 308                    st->in_slow_mc,
 309                    st->in_no_route,
 310                    st->in_brd,
 311                    st->in_martian_dst,
 312                    st->in_martian_src,
 313
 314                    st->out_hit,
 315                    st->out_slow_tot,
 316                    st->out_slow_mc,
 317
 318                    st->gc_total,
 319                    st->gc_ignored,
 320                    st->gc_goal_miss,
 321                    st->gc_dst_overflow,
 322                    st->in_hlist_search,
 323                    st->out_hlist_search
 324                 );
 325         return 0;
 326 }
 327
 328 static const struct seq_operations rt_cpu_seq_ops = {
 329         .start  = rt_cpu_seq_start,
 330         .next   = rt_cpu_seq_next,
 331         .stop   = rt_cpu_seq_stop,
 332         .show   = rt_cpu_seq_show,
 333 };
 334
 335
 336 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 337 {
 338         return seq_open(file, &rt_cpu_seq_ops);
 339 }
 340
 341 static const struct file_operations rt_cpu_seq_fops = {
 342         .owner   = THIS_MODULE,
 343         .open    = rt_cpu_seq_open,
 344         .read    = seq_read,
 345         .llseek  = seq_lseek,
 346         .release = seq_release,
 347 };
 348
 349 #ifdef CONFIG_IP_ROUTE_CLASSID
 350 static int rt_acct_proc_show(struct seq_file *m, void *v)
 351 {
 352         struct ip_rt_acct *dst, *src;
 353         unsigned int i, j;
 354
 355         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 356         if (!dst)
 357                 return -ENOMEM;
 358
 359         for_each_possible_cpu(i) {
 360                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 361                 for (j = 0; j < 256; j++) {
 362                         dst[j].o_bytes   += src[j].o_bytes;
 363                         dst[j].o_packets += src[j].o_packets;
 364                         dst[j].i_bytes   += src[j].i_bytes;
 365                         dst[j].i_packets += src[j].i_packets;
 366                 }
 367         }
 368
 369         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 370         kfree(dst);
 371         return 0;
 372 }
 373
 374 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 375 {
 376         return single_open(file, rt_acct_proc_show, NULL);
 377 }
 378
 379 static const struct file_operations rt_acct_proc_fops = {
 380         .owner          = THIS_MODULE,
 381         .open           = rt_acct_proc_open,
 382         .read           = seq_read,
 383         .llseek         = seq_lseek,
 384         .release        = single_release,
 385 };
 386 #endif
 387
 388 static int __net_init ip_rt_do_proc_init(struct net *net)
 389 {
 390         struct proc_dir_entry *pde;
 391
 392         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 393                         &rt_cache_seq_fops);
 394         if (!pde)
 395                 goto err1;
 396
 397         pde = proc_create("rt_cache", S_IRUGO,
 398                           net->proc_net_stat, &rt_cpu_seq_fops);
 399         if (!pde)
 400                 goto err2;
 401
 402 #ifdef CONFIG_IP_ROUTE_CLASSID
 403         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 404         if (!pde)
 405                 goto err3;
 406 #endif
 407         return 0;
 408
 409 #ifdef CONFIG_IP_ROUTE_CLASSID
 410 err3:
 411         remove_proc_entry("rt_cache", net->proc_net_stat);
 412 #endif
 413 err2:
 414         remove_proc_entry("rt_cache", net->proc_net);
 415 err1:
 416         return -ENOMEM;
 417 }
 418
 419 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 420 {
 421         remove_proc_entry("rt_cache", net->proc_net_stat);
 422         remove_proc_entry("rt_cache", net->proc_net);
 423 #ifdef CONFIG_IP_ROUTE_CLASSID
 424         remove_proc_entry("rt_acct", net->proc_net);
 425 #endif
 426 }
 427
 428 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 429         .init = ip_rt_do_proc_init,
 430         .exit = ip_rt_do_proc_exit,
 431 };
 432
 433 static int __init ip_rt_proc_init(void)
 434 {
 435         return register_pernet_subsys(&ip_rt_proc_ops);
 436 }
 437
 438 #else
 439 static inline int ip_rt_proc_init(void)
 440 {
 441         return 0;
 442 }
 443 #endif /* CONFIG_PROC_FS */
 444
 445 static inline bool rt_is_expired(const struct rtable *rth)
 446 {
 447         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 448 }
 449
 450 /*
 451  * Perturbation of rt_genid by a small quantity [1..256]
 452  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 453  * many times (2^24) without giving recent rt_genid.
 454  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 455  */
 456 static void rt_cache_invalidate(struct net *net)
 457 {
 458         unsigned char shuffle;
 459
 460         get_random_bytes(&shuffle, sizeof(shuffle));
 461         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 462 }
 463
 464 /*
 465  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 466  * delay >= 0 : invalidate & flush cache (can be long)
 467  */
 468 void rt_cache_flush(struct net *net, int delay)
 469 {
 470         rt_cache_invalidate(net);
 471 }
 472
 473 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 474                                            struct sk_buff *skb,
 475                                            const void *daddr)
 476 {
 477         struct net_device *dev = dst->dev;
 478         const __be32 *pkey = daddr;
 479         const struct rtable *rt;
 480         struct neighbour *n;
 481
 482         rt = (const struct rtable *) dst;
 483         if (rt->rt_gateway)
 484                 pkey = (const __be32 *) &rt->rt_gateway;
 485         else if (skb)
 486                 pkey = &ip_hdr(skb)->daddr;
 487
 488         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 489         if (n)
 490                 return n;
 491         return neigh_create(&arp_tbl, pkey, dev);
 492 }
 493
 494 /*
 495  * Peer allocation may fail only in serious out-of-memory conditions.  However
 496  * we still can generate some output.
 497  * Random ID selection looks a bit dangerous because we have no chances to
 498  * select ID being unique in a reasonable period of time.
 499  * But broken packet identifier may be better than no packet at all.
 500  */
 501 static void ip_select_fb_ident(struct iphdr *iph)
 502 {
 503         static DEFINE_SPINLOCK(ip_fb_id_lock);
 504         static u32 ip_fallback_id;
 505         u32 salt;
 506
 507         spin_lock_bh(&ip_fb_id_lock);
 508         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
 509         iph->id = htons(salt & 0xFFFF);
 510         ip_fallback_id = salt;
 511         spin_unlock_bh(&ip_fb_id_lock);
 512 }
 513
 514 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 515 {
 516         struct net *net = dev_net(dst->dev);
 517         struct inet_peer *peer;
 518
 519         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
 520         if (peer) {
 521                 iph->id = htons(inet_getid(peer, more));
 522                 inet_putpeer(peer);
 523                 return;
 524         }
 525
 526         ip_select_fb_ident(iph);
 527 }
 528 EXPORT_SYMBOL(__ip_select_ident);
 529
 530 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 531                              const struct iphdr *iph,
 532                              int oif, u8 tos,
 533                              u8 prot, u32 mark, int flow_flags)
 534 {
 535         if (sk) {
 536                 const struct inet_sock *inet = inet_sk(sk);
 537
 538                 oif = sk->sk_bound_dev_if;
 539                 mark = sk->sk_mark;
 540                 tos = RT_CONN_FLAGS(sk);
 541                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 542         }
 543         flowi4_init_output(fl4, oif, mark, tos,
 544                            RT_SCOPE_UNIVERSE, prot,
 545                            flow_flags,
 546                            iph->daddr, iph->saddr, 0, 0);
 547 }
 548
 549 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 550                                const struct sock *sk)
 551 {
 552         const struct iphdr *iph = ip_hdr(skb);
 553         int oif = skb->dev->ifindex;
 554         u8 tos = RT_TOS(iph->tos);
 555         u8 prot = iph->protocol;
 556         u32 mark = skb->mark;
 557
 558         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 559 }
 560
 561 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 562 {
 563         const struct inet_sock *inet = inet_sk(sk);
 564         const struct ip_options_rcu *inet_opt;
 565         __be32 daddr = inet->inet_daddr;
 566
 567         rcu_read_lock();
 568         inet_opt = rcu_dereference(inet->inet_opt);
 569         if (inet_opt && inet_opt->opt.srr)
 570                 daddr = inet_opt->opt.faddr;
 571         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 572                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 573                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 574                            inet_sk_flowi_flags(sk),
 575                            daddr, inet->inet_saddr, 0, 0);
 576         rcu_read_unlock();
 577 }
 578
 579 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 580                                  const struct sk_buff *skb)
 581 {
 582         if (skb)
 583                 build_skb_flow_key(fl4, skb, sk);
 584         else
 585                 build_sk_flow_key(fl4, sk);
 586 }
 587
 588 static inline void rt_free(struct rtable *rt)
 589 {
 590         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 591 }
 592
 593 static DEFINE_SPINLOCK(fnhe_lock);
 594
 595 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 596 {
 597         struct fib_nh_exception *fnhe, *oldest;
 598         struct rtable *orig;
 599
 600         oldest = rcu_dereference(hash->chain);
 601         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 602              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 603                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 604                         oldest = fnhe;
 605         }
 606         orig = rcu_dereference(oldest->fnhe_rth);
 607         if (orig) {
 608                 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
 609                 rt_free(orig);
 610         }
 611         return oldest;
 612 }
 613
 614 static inline u32 fnhe_hashfun(__be32 daddr)
 615 {
 616         u32 hval;
 617
 618         hval = (__force u32) daddr;
 619         hval ^= (hval >> 11) ^ (hval >> 22);
 620
 621         return hval & (FNHE_HASH_SIZE - 1);
 622 }
 623
 624 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 625                                   u32 pmtu, unsigned long expires)
 626 {
 627         struct fnhe_hash_bucket *hash;
 628         struct fib_nh_exception *fnhe;
 629         int depth;
 630         u32 hval = fnhe_hashfun(daddr);
 631
 632         spin_lock_bh(&fnhe_lock);
 633
 634         hash = nh->nh_exceptions;
 635         if (!hash) {
 636                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 637                 if (!hash)
 638                         goto out_unlock;
 639                 nh->nh_exceptions = hash;
 640         }
 641
 642         hash += hval;
 643
 644         depth = 0;
 645         for (fnhe = rcu_dereference(hash->chain); fnhe;
 646              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 647                 if (fnhe->fnhe_daddr == daddr)
 648                         break;
 649                 depth++;
 650         }
 651
 652         if (fnhe) {
 653                 if (gw)
 654                         fnhe->fnhe_gw = gw;
 655                 if (pmtu) {
 656                         fnhe->fnhe_pmtu = pmtu;
 657                         fnhe->fnhe_expires = expires;
 658                 }
 659         } else {
 660                 if (depth > FNHE_RECLAIM_DEPTH)
 661                         fnhe = fnhe_oldest(hash);
 662                 else {
 663                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 664                         if (!fnhe)
 665                                 goto out_unlock;
 666
 667                         fnhe->fnhe_next = hash->chain;
 668                         rcu_assign_pointer(hash->chain, fnhe);
 669                 }
 670                 fnhe->fnhe_daddr = daddr;
 671                 fnhe->fnhe_gw = gw;
 672                 fnhe->fnhe_pmtu = pmtu;
 673                 fnhe->fnhe_expires = expires;
 674         }
 675
 676         fnhe->fnhe_stamp = jiffies;
 677
 678 out_unlock:
 679         spin_unlock_bh(&fnhe_lock);
 680         return;
 681 }
 682
 683 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 684                              bool kill_route)
 685 {
 686         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 687         __be32 old_gw = ip_hdr(skb)->saddr;
 688         struct net_device *dev = skb->dev;
 689         struct in_device *in_dev;
 690         struct fib_result res;
 691         struct neighbour *n;
 692         struct net *net;
 693
 694         switch (icmp_hdr(skb)->code & 7) {
 695         case ICMP_REDIR_NET:
 696         case ICMP_REDIR_NETTOS:
 697         case ICMP_REDIR_HOST:
 698         case ICMP_REDIR_HOSTTOS:
 699                 break;
 700
 701         default:
 702                 return;
 703         }
 704
 705         if (rt->rt_gateway != old_gw)
 706                 return;
 707
 708         in_dev = __in_dev_get_rcu(dev);
 709         if (!in_dev)
 710                 return;
 711
 712         net = dev_net(dev);
 713         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 714             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 715             ipv4_is_zeronet(new_gw))
 716                 goto reject_redirect;
 717
 718         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 719                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 720                         goto reject_redirect;
 721                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 722                         goto reject_redirect;
 723         } else {
 724                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 725                         goto reject_redirect;
 726         }
 727
 728         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
 729         if (n) {
 730                 if (!(n->nud_state & NUD_VALID)) {
 731                         neigh_event_send(n, NULL);
 732                 } else {
 733                         if (fib_lookup(net, fl4, &res) == 0) {
 734                                 struct fib_nh *nh = &FIB_RES_NH(res);
 735
 736                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 737                                                       0, 0);
 738                         }
 739                         if (kill_route)
 740                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 741                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 742                 }
 743                 neigh_release(n);
 744         }
 745         return;
 746
 747 reject_redirect:
 748 #ifdef CONFIG_IP_ROUTE_VERBOSE
 749         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 750                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 751                 __be32 daddr = iph->daddr;
 752                 __be32 saddr = iph->saddr;
 753
 754                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 755                                      "  Advised path = %pI4 -> %pI4\n",
 756                                      &old_gw, dev->name, &new_gw,
 757                                      &saddr, &daddr);
 758         }
 759 #endif
 760         ;
 761 }
 762
 763 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 764 {
 765         struct rtable *rt;
 766         struct flowi4 fl4;
 767
 768         rt = (struct rtable *) dst;
 769
 770         ip_rt_build_flow_key(&fl4, sk, skb);
 771         __ip_do_redirect(rt, skb, &fl4, true);
 772 }
 773
 774 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 775 {
 776         struct rtable *rt = (struct rtable *)dst;
 777         struct dst_entry *ret = dst;
 778
 779         if (rt) {
 780                 if (dst->obsolete > 0) {
 781                         ip_rt_put(rt);
 782                         ret = NULL;
 783                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 784                            rt->dst.expires) {
 785                         ip_rt_put(rt);
 786                         ret = NULL;
 787                 }
 788         }
 789         return ret;
 790 }
 791
 792 /*
 793  * Algorithm:
 794  *      1. The first ip_rt_redirect_number redirects are sent
 795  *         with exponential backoff, then we stop sending them at all,
 796  *         assuming that the host ignores our redirects.
 797  *      2. If we did not see packets requiring redirects
 798  *         during ip_rt_redirect_silence, we assume that the host
 799  *         forgot redirected route and start to send redirects again.
 800  *
 801  * This algorithm is much cheaper and more intelligent than dumb load limiting
 802  * in icmp.c.
 803  *
 804  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 805  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 806  */
 807
 808 void ip_rt_send_redirect(struct sk_buff *skb)
 809 {
 810         struct rtable *rt = skb_rtable(skb);
 811         struct in_device *in_dev;
 812         struct inet_peer *peer;
 813         struct net *net;
 814         int log_martians;
 815
 816         rcu_read_lock();
 817         in_dev = __in_dev_get_rcu(rt->dst.dev);
 818         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 819                 rcu_read_unlock();
 820                 return;
 821         }
 822         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 823         rcu_read_unlock();
 824
 825         net = dev_net(rt->dst.dev);
 826         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 827         if (!peer) {
 828                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
 829                 return;
 830         }
 831
 832         /* No redirected packets during ip_rt_redirect_silence;
 833          * reset the algorithm.
 834          */
 835         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 836                 peer->rate_tokens = 0;
 837
 838         /* Too many ignored redirects; do not send anything
 839          * set dst.rate_last to the last seen redirected packet.
 840          */
 841         if (peer->rate_tokens >= ip_rt_redirect_number) {
 842                 peer->rate_last = jiffies;
 843                 goto out_put_peer;
 844         }
 845
 846         /* Check for load limit; set rate_last to the latest sent
 847          * redirect.
 848          */
 849         if (peer->rate_tokens == 0 ||
 850             time_after(jiffies,
 851                        (peer->rate_last +
 852                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 853                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
 854                 peer->rate_last = jiffies;
 855                 ++peer->rate_tokens;
 856 #ifdef CONFIG_IP_ROUTE_VERBOSE
 857                 if (log_martians &&
 858                     peer->rate_tokens == ip_rt_redirect_number)
 859                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 860                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 861                                              &ip_hdr(skb)->daddr, &rt->rt_gateway);
 862 #endif
 863         }
 864 out_put_peer:
 865         inet_putpeer(peer);
 866 }
 867
 868 static int ip_error(struct sk_buff *skb)
 869 {
 870         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 871         struct rtable *rt = skb_rtable(skb);
 872         struct inet_peer *peer;
 873         unsigned long now;
 874         struct net *net;
 875         bool send;
 876         int code;
 877
 878         net = dev_net(rt->dst.dev);
 879         if (!IN_DEV_FORWARD(in_dev)) {
 880                 switch (rt->dst.error) {
 881                 case EHOSTUNREACH:
 882                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
 883                         break;
 884
 885                 case ENETUNREACH:
 886                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 887                         break;
 888                 }
 889                 goto out;
 890         }
 891
 892         switch (rt->dst.error) {
 893         case EINVAL:
 894         default:
 895                 goto out;
 896         case EHOSTUNREACH:
 897                 code = ICMP_HOST_UNREACH;
 898                 break;
 899         case ENETUNREACH:
 900                 code = ICMP_NET_UNREACH;
 901                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 902                 break;
 903         case EACCES:
 904                 code = ICMP_PKT_FILTERED;
 905                 break;
 906         }
 907
 908         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 909
 910         send = true;
 911         if (peer) {
 912                 now = jiffies;
 913                 peer->rate_tokens += now - peer->rate_last;
 914                 if (peer->rate_tokens > ip_rt_error_burst)
 915                         peer->rate_tokens = ip_rt_error_burst;
 916                 peer->rate_last = now;
 917                 if (peer->rate_tokens >= ip_rt_error_cost)
 918                         peer->rate_tokens -= ip_rt_error_cost;
 919                 else
 920                         send = false;
 921                 inet_putpeer(peer);
 922         }
 923         if (send)
 924                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 925
 926 out:    kfree_skb(skb);
 927         return 0;
 928 }
 929
 930 static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 931 {
 932         struct fib_result res;
 933
 934         if (mtu < ip_rt_min_pmtu)
 935                 mtu = ip_rt_min_pmtu;
 936
 937         rcu_read_lock();
 938         if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
 939                 struct fib_nh *nh = &FIB_RES_NH(res);
 940
 941                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 942                                       jiffies + ip_rt_mtu_expires);
 943         }
 944         rcu_read_unlock();
 945         return mtu;
 946 }
 947
 948 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 949                               struct sk_buff *skb, u32 mtu)
 950 {
 951         struct rtable *rt = (struct rtable *) dst;
 952         struct flowi4 fl4;
 953
 954         ip_rt_build_flow_key(&fl4, sk, skb);
 955         mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
 956
 957         if (!rt->rt_pmtu) {
 958                 dst->obsolete = DST_OBSOLETE_KILL;
 959         } else {
 960                 rt->rt_pmtu = mtu;
 961                 rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires);
 962         }
 963 }
 964
 965 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 966                       int oif, u32 mark, u8 protocol, int flow_flags)
 967 {
 968         const struct iphdr *iph = (const struct iphdr *) skb->data;
 969         struct flowi4 fl4;
 970         struct rtable *rt;
 971
 972         __build_flow_key(&fl4, NULL, iph, oif,
 973                          RT_TOS(iph->tos), protocol, mark, flow_flags);
 974         rt = __ip_route_output_key(net, &fl4);
 975         if (!IS_ERR(rt)) {
 976                 __ip_rt_update_pmtu(rt, &fl4, mtu);
 977                 ip_rt_put(rt);
 978         }
 979 }
 980 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
 981
 982 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 983 {
 984         const struct iphdr *iph = (const struct iphdr *) skb->data;
 985         struct flowi4 fl4;
 986         struct rtable *rt;
 987
 988         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
 989         rt = __ip_route_output_key(sock_net(sk), &fl4);
 990         if (!IS_ERR(rt)) {
 991                 __ip_rt_update_pmtu(rt, &fl4, mtu);
 992                 ip_rt_put(rt);
 993         }
 994 }
 995 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
 996
 997 void ipv4_redirect(struct sk_buff *skb, struct net *net,
 998                    int oif, u32 mark, u8 protocol, int flow_flags)
 999 {
1000         const struct iphdr *iph = (const struct iphdr *) skb->data;
1001         struct flowi4 fl4;
1002         struct rtable *rt;
1003
1004         __build_flow_key(&fl4, NULL, iph, oif,
1005                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1006         rt = __ip_route_output_key(net, &fl4);
1007         if (!IS_ERR(rt)) {
1008                 __ip_do_redirect(rt, skb, &fl4, false);
1009                 ip_rt_put(rt);
1010         }
1011 }
1012 EXPORT_SYMBOL_GPL(ipv4_redirect);
1013
1014 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1015 {
1016         const struct iphdr *iph = (const struct iphdr *) skb->data;
1017         struct flowi4 fl4;
1018         struct rtable *rt;
1019
1020         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1021         rt = __ip_route_output_key(sock_net(sk), &fl4);
1022         if (!IS_ERR(rt)) {
1023                 __ip_do_redirect(rt, skb, &fl4, false);
1024                 ip_rt_put(rt);
1025         }
1026 }
1027 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1028
1029 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1030 {
1031         struct rtable *rt = (struct rtable *) dst;
1032
1033         /* All IPV4 dsts are created with ->obsolete set to the value
1034          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1035          * into this function always.
1036          *
1037          * When a PMTU/redirect information update invalidates a
1038          * route, this is indicated by setting obsolete to
1039          * DST_OBSOLETE_KILL.
1040          */
1041         if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1042                 return NULL;
1043         return dst;
1044 }
1045
1046 static void ipv4_link_failure(struct sk_buff *skb)
1047 {
1048         struct rtable *rt;
1049
1050         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1051
1052         rt = skb_rtable(skb);
1053         if (rt)
1054                 dst_set_expires(&rt->dst, 0);
1055 }
1056
1057 static int ip_rt_bug(struct sk_buff *skb)
1058 {
1059         pr_debug("%s: %pI4 -> %pI4, %s\n",
1060                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1061                  skb->dev ? skb->dev->name : "?");
1062         kfree_skb(skb);
1063         WARN_ON(1);
1064         return 0;
1065 }
1066
1067 /*
1068    We do not cache source address of outgoing interface,
1069    because it is used only by IP RR, TS and SRR options,
1070    so that it out of fast path.
1071
1072    BTW remember: "addr" is allowed to be not aligned
1073    in IP options!
1074  */
1075
1076 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1077 {
1078         __be32 src;
1079
1080         if (rt_is_output_route(rt))
1081                 src = ip_hdr(skb)->saddr;
1082         else {
1083                 struct fib_result res;
1084                 struct flowi4 fl4;
1085                 struct iphdr *iph;
1086
1087                 iph = ip_hdr(skb);
1088
1089                 memset(&fl4, 0, sizeof(fl4));
1090                 fl4.daddr = iph->daddr;
1091                 fl4.saddr = iph->saddr;
1092                 fl4.flowi4_tos = RT_TOS(iph->tos);
1093                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1094                 fl4.flowi4_iif = skb->dev->ifindex;
1095                 fl4.flowi4_mark = skb->mark;
1096
1097                 rcu_read_lock();
1098                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1099                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1100                 else
1101                         src = inet_select_addr(rt->dst.dev,
1102                                                rt_nexthop(rt, iph->daddr),
1103                                                RT_SCOPE_UNIVERSE);
1104                 rcu_read_unlock();
1105         }
1106         memcpy(addr, &src, 4);
1107 }
1108
1109 #ifdef CONFIG_IP_ROUTE_CLASSID
1110 static void set_class_tag(struct rtable *rt, u32 tag)
1111 {
1112         if (!(rt->dst.tclassid & 0xFFFF))
1113                 rt->dst.tclassid |= tag & 0xFFFF;
1114         if (!(rt->dst.tclassid & 0xFFFF0000))
1115                 rt->dst.tclassid |= tag & 0xFFFF0000;
1116 }
1117 #endif
1118
1119 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1120 {
1121         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1122
1123         if (advmss == 0) {
1124                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1125                                ip_rt_min_advmss);
1126                 if (advmss > 65535 - 40)
1127                         advmss = 65535 - 40;
1128         }
1129         return advmss;
1130 }
1131
1132 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1133 {
1134         const struct rtable *rt = (const struct rtable *) dst;
1135         unsigned int mtu = rt->rt_pmtu;
1136
1137         if (mtu && time_after_eq(jiffies, rt->dst.expires))
1138                 mtu = 0;
1139
1140         if (!mtu)
1141                 mtu = dst_metric_raw(dst, RTAX_MTU);
1142
1143         if (mtu && rt_is_output_route(rt))
1144                 return mtu;
1145
1146         mtu = dst->dev->mtu;
1147
1148         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1149                 if (rt->rt_gateway && mtu > 576)
1150                         mtu = 576;
1151         }
1152
1153         if (mtu > IP_MAX_MTU)
1154                 mtu = IP_MAX_MTU;
1155
1156         return mtu;
1157 }
1158
1159 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1160 {
1161         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1162         struct fib_nh_exception *fnhe;
1163         u32 hval;
1164
1165         if (!hash)
1166                 return NULL;
1167
1168         hval = fnhe_hashfun(daddr);
1169
1170         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1171              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1172                 if (fnhe->fnhe_daddr == daddr)
1173                         return fnhe;
1174         }
1175         return NULL;
1176 }
1177
1178 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1179                               __be32 daddr)
1180 {
1181         bool ret = false;
1182
1183         spin_lock_bh(&fnhe_lock);
1184
1185         if (daddr == fnhe->fnhe_daddr) {
1186                 struct rtable *orig;
1187
1188                 if (fnhe->fnhe_pmtu) {
1189                         unsigned long expires = fnhe->fnhe_expires;
1190                         unsigned long diff = expires - jiffies;
1191
1192                         if (time_before(jiffies, expires)) {
1193                                 rt->rt_pmtu = fnhe->fnhe_pmtu;
1194                                 dst_set_expires(&rt->dst, diff);
1195                         }
1196                 }
1197                 if (fnhe->fnhe_gw) {
1198                         rt->rt_flags |= RTCF_REDIRECTED;
1199                         rt->rt_gateway = fnhe->fnhe_gw;
1200                 }
1201
1202                 orig = rcu_dereference(fnhe->fnhe_rth);
1203                 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1204                 if (orig)
1205                         rt_free(orig);
1206
1207                 fnhe->fnhe_stamp = jiffies;
1208                 ret = true;
1209         } else {
1210                 /* Routes we intend to cache in nexthop exception have
1211                  * the DST_NOCACHE bit clear.  However, if we are
1212                  * unsuccessful at storing this route into the cache
1213                  * we really need to set it.
1214                  */
1215                 rt->dst.flags |= DST_NOCACHE;
1216         }
1217         spin_unlock_bh(&fnhe_lock);
1218
1219         return ret;
1220 }
1221
1222 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1223 {
1224         struct rtable *orig, *prev, **p;
1225         bool ret = true;
1226
1227         if (rt_is_input_route(rt)) {
1228                 p = (struct rtable **)&nh->nh_rth_input;
1229         } else {
1230                 if (!nh->nh_pcpu_rth_output)
1231                         goto nocache;
1232                 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1233         }
1234         orig = *p;
1235
1236         prev = cmpxchg(p, orig, rt);
1237         if (prev == orig) {
1238                 if (orig)
1239                         rt_free(orig);
1240         } else {
1241                 /* Routes we intend to cache in the FIB nexthop have
1242                  * the DST_NOCACHE bit clear.  However, if we are
1243                  * unsuccessful at storing this route into the cache
1244                  * we really need to set it.
1245                  */
1246 nocache:
1247                 rt->dst.flags |= DST_NOCACHE;
1248                 ret = false;
1249         }
1250
1251         return ret;
1252 }
1253
1254 static DEFINE_SPINLOCK(rt_uncached_lock);
1255 static LIST_HEAD(rt_uncached_list);
1256
1257 static void rt_add_uncached_list(struct rtable *rt)
1258 {
1259         spin_lock_bh(&rt_uncached_lock);
1260         list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1261         spin_unlock_bh(&rt_uncached_lock);
1262 }
1263
1264 static void ipv4_dst_destroy(struct dst_entry *dst)
1265 {
1266         struct rtable *rt = (struct rtable *) dst;
1267
1268         if (!list_empty(&rt->rt_uncached)) {
1269                 spin_lock_bh(&rt_uncached_lock);
1270                 list_del(&rt->rt_uncached);
1271                 spin_unlock_bh(&rt_uncached_lock);
1272         }
1273 }
1274
1275 void rt_flush_dev(struct net_device *dev)
1276 {
1277         if (!list_empty(&rt_uncached_list)) {
1278                 struct net *net = dev_net(dev);
1279                 struct rtable *rt;
1280
1281                 spin_lock_bh(&rt_uncached_lock);
1282                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1283                         if (rt->dst.dev != dev)
1284                                 continue;
1285                         rt->dst.dev = net->loopback_dev;
1286                         dev_hold(rt->dst.dev);
1287                         dev_put(dev);
1288                 }
1289                 spin_unlock_bh(&rt_uncached_lock);
1290         }
1291 }
1292
1293 static bool rt_cache_valid(const struct rtable *rt)
1294 {
1295         return  rt &&
1296                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1297                 !rt_is_expired(rt);
1298 }
1299
1300 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1301                            const struct fib_result *res,
1302                            struct fib_nh_exception *fnhe,
1303                            struct fib_info *fi, u16 type, u32 itag)
1304 {
1305         bool cached = false;
1306
1307         if (fi) {
1308                 struct fib_nh *nh = &FIB_RES_NH(*res);
1309
1310                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1311                         rt->rt_gateway = nh->nh_gw;
1312                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1313 #ifdef CONFIG_IP_ROUTE_CLASSID
1314                 rt->dst.tclassid = nh->nh_tclassid;
1315 #endif
1316                 if (unlikely(fnhe))
1317                         cached = rt_bind_exception(rt, fnhe, daddr);
1318                 else if (!(rt->dst.flags & DST_NOCACHE))
1319                         cached = rt_cache_route(nh, rt);
1320         }
1321         if (unlikely(!cached))
1322                 rt_add_uncached_list(rt);
1323
1324 #ifdef CONFIG_IP_ROUTE_CLASSID
1325 #ifdef CONFIG_IP_MULTIPLE_TABLES
1326         set_class_tag(rt, res->tclassid);
1327 #endif
1328         set_class_tag(rt, itag);
1329 #endif
1330 }
1331
1332 static struct rtable *rt_dst_alloc(struct net_device *dev,
1333                                    bool nopolicy, bool noxfrm, bool will_cache)
1334 {
1335         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1336                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1337                          (nopolicy ? DST_NOPOLICY : 0) |
1338                          (noxfrm ? DST_NOXFRM : 0));
1339 }
1340
1341 /* called in rcu_read_lock() section */
1342 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1343                                 u8 tos, struct net_device *dev, int our)
1344 {
1345         struct rtable *rth;
1346         struct in_device *in_dev = __in_dev_get_rcu(dev);
1347         u32 itag = 0;
1348         int err;
1349
1350         /* Primary sanity checks. */
1351
1352         if (in_dev == NULL)
1353                 return -EINVAL;
1354
1355         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1356             skb->protocol != htons(ETH_P_IP))
1357                 goto e_inval;
1358
1359         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1360                 if (ipv4_is_loopback(saddr))
1361                         goto e_inval;
1362
1363         if (ipv4_is_zeronet(saddr)) {
1364                 if (!ipv4_is_local_multicast(daddr))
1365                         goto e_inval;
1366         } else {
1367                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1368                                           in_dev, &itag);
1369                 if (err < 0)
1370                         goto e_err;
1371         }
1372         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1373                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1374         if (!rth)
1375                 goto e_nobufs;
1376
1377 #ifdef CONFIG_IP_ROUTE_CLASSID
1378         rth->dst.tclassid = itag;
1379 #endif
1380         rth->dst.output = ip_rt_bug;
1381
1382         rth->rt_genid   = rt_genid(dev_net(dev));
1383         rth->rt_flags   = RTCF_MULTICAST;
1384         rth->rt_type    = RTN_MULTICAST;
1385         rth->rt_is_input= 1;
1386         rth->rt_iif     = 0;
1387         rth->rt_pmtu    = 0;
1388         rth->rt_gateway = 0;
1389         INIT_LIST_HEAD(&rth->rt_uncached);
1390         if (our) {
1391                 rth->dst.input= ip_local_deliver;
1392                 rth->rt_flags |= RTCF_LOCAL;
1393         }
1394
1395 #ifdef CONFIG_IP_MROUTE
1396         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1397                 rth->dst.input = ip_mr_input;
1398 #endif
1399         RT_CACHE_STAT_INC(in_slow_mc);
1400
1401         skb_dst_set(skb, &rth->dst);
1402         return 0;
1403
1404 e_nobufs:
1405         return -ENOBUFS;
1406 e_inval:
1407         return -EINVAL;
1408 e_err:
1409         return err;
1410 }
1411
1412
1413 static void ip_handle_martian_source(struct net_device *dev,
1414                                      struct in_device *in_dev,
1415                                      struct sk_buff *skb,
1416                                      __be32 daddr,
1417                                      __be32 saddr)
1418 {
1419         RT_CACHE_STAT_INC(in_martian_src);
1420 #ifdef CONFIG_IP_ROUTE_VERBOSE
1421         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1422                 /*
1423                  *      RFC1812 recommendation, if source is martian,
1424                  *      the only hint is MAC header.
1425                  */
1426                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1427                         &daddr, &saddr, dev->name);
1428                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1429                         print_hex_dump(KERN_WARNING, "ll header: ",
1430                                        DUMP_PREFIX_OFFSET, 16, 1,
1431                                        skb_mac_header(skb),
1432                                        dev->hard_header_len, true);
1433                 }
1434         }
1435 #endif
1436 }
1437
1438 /* called in rcu_read_lock() section */
1439 static int __mkroute_input(struct sk_buff *skb,
1440                            const struct fib_result *res,
1441                            struct in_device *in_dev,
1442                            __be32 daddr, __be32 saddr, u32 tos)
1443 {
1444         struct rtable *rth;
1445         int err;
1446         struct in_device *out_dev;
1447         unsigned int flags = 0;
1448         bool do_cache;
1449         u32 itag;
1450
1451         /* get a working reference to the output device */
1452         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1453         if (out_dev == NULL) {
1454                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1455                 return -EINVAL;
1456         }
1457
1458
1459         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1460                                   in_dev->dev, in_dev, &itag);
1461         if (err < 0) {
1462                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1463                                          saddr);
1464
1465                 goto cleanup;
1466         }
1467
1468         if (out_dev == in_dev && err &&
1469             (IN_DEV_SHARED_MEDIA(out_dev) ||
1470              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1471                 flags |= RTCF_DOREDIRECT;
1472
1473         if (skb->protocol != htons(ETH_P_IP)) {
1474                 /* Not IP (i.e. ARP). Do not create route, if it is
1475                  * invalid for proxy arp. DNAT routes are always valid.
1476                  *
1477                  * Proxy arp feature have been extended to allow, ARP
1478                  * replies back to the same interface, to support
1479                  * Private VLAN switch technologies. See arp.c.
1480                  */
1481                 if (out_dev == in_dev &&
1482                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1483                         err = -EINVAL;
1484                         goto cleanup;
1485                 }
1486         }
1487
1488         do_cache = false;
1489         if (res->fi) {
1490                 if (!itag) {
1491                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1492                         if (rt_cache_valid(rth)) {
1493                                 skb_dst_set_noref(skb, &rth->dst);
1494                                 goto out;
1495                         }
1496                         do_cache = true;
1497                 }
1498         }
1499
1500         rth = rt_dst_alloc(out_dev->dev,
1501                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1502                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1503         if (!rth) {
1504                 err = -ENOBUFS;
1505                 goto cleanup;
1506         }
1507
1508         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1509         rth->rt_flags = flags;
1510         rth->rt_type = res->type;
1511         rth->rt_is_input = 1;
1512         rth->rt_iif     = 0;
1513         rth->rt_pmtu    = 0;
1514         rth->rt_gateway = 0;
1515         INIT_LIST_HEAD(&rth->rt_uncached);
1516
1517         rth->dst.input = ip_forward;
1518         rth->dst.output = ip_output;
1519
1520         rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1521         skb_dst_set(skb, &rth->dst);
1522 out:
1523         err = 0;
1524  cleanup:
1525         return err;
1526 }
1527
1528 static int ip_mkroute_input(struct sk_buff *skb,
1529                             struct fib_result *res,
1530                             const struct flowi4 *fl4,
1531                             struct in_device *in_dev,
1532                             __be32 daddr, __be32 saddr, u32 tos)
1533 {
1534 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1535         if (res->fi && res->fi->fib_nhs > 1)
1536                 fib_select_multipath(res);
1537 #endif
1538
1539         /* create a routing cache entry */
1540         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1541 }
1542
1543 /*
1544  *      NOTE. We drop all the packets that has local source
1545  *      addresses, because every properly looped back packet
1546  *      must have correct destination already attached by output routine.
1547  *
1548  *      Such approach solves two big problems:
1549  *      1. Not simplex devices are handled properly.
1550  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1551  *      called with rcu_read_lock()
1552  */
1553
1554 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1555                                u8 tos, struct net_device *dev)
1556 {
1557         struct fib_result res;
1558         struct in_device *in_dev = __in_dev_get_rcu(dev);
1559         struct flowi4   fl4;
1560         unsigned int    flags = 0;
1561         u32             itag = 0;
1562         struct rtable   *rth;
1563         int             err = -EINVAL;
1564         struct net    *net = dev_net(dev);
1565         bool do_cache;
1566
1567         /* IP on this device is disabled. */
1568
1569         if (!in_dev)
1570                 goto out;
1571
1572         /* Check for the most weird martians, which can be not detected
1573            by fib_lookup.
1574          */
1575
1576         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1577                 goto martian_source;
1578
1579         res.fi = NULL;
1580         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1581                 goto brd_input;
1582
1583         /* Accept zero addresses only to limited broadcast;
1584          * I even do not know to fix it or not. Waiting for complains :-)
1585          */
1586         if (ipv4_is_zeronet(saddr))
1587                 goto martian_source;
1588
1589         if (ipv4_is_zeronet(daddr))
1590                 goto martian_destination;
1591
1592         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1593          * and call it once if daddr or/and saddr are loopback addresses
1594          */
1595         if (ipv4_is_loopback(daddr)) {
1596                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1597                         goto martian_destination;
1598         } else if (ipv4_is_loopback(saddr)) {
1599                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1600                         goto martian_source;
1601         }
1602
1603         /*
1604          *      Now we are ready to route packet.
1605          */
1606         fl4.flowi4_oif = 0;
1607         fl4.flowi4_iif = dev->ifindex;
1608         fl4.flowi4_mark = skb->mark;
1609         fl4.flowi4_tos = tos;
1610         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1611         fl4.daddr = daddr;
1612         fl4.saddr = saddr;
1613         err = fib_lookup(net, &fl4, &res);
1614         if (err != 0)
1615                 goto no_route;
1616
1617         RT_CACHE_STAT_INC(in_slow_tot);
1618
1619         if (res.type == RTN_BROADCAST)
1620                 goto brd_input;
1621
1622         if (res.type == RTN_LOCAL) {
1623                 err = fib_validate_source(skb, saddr, daddr, tos,
1624                                           LOOPBACK_IFINDEX,
1625                                           dev, in_dev, &itag);
1626                 if (err < 0)
1627                         goto martian_source_keep_err;
1628                 goto local_input;
1629         }
1630
1631         if (!IN_DEV_FORWARD(in_dev))
1632                 goto no_route;
1633         if (res.type != RTN_UNICAST)
1634                 goto martian_destination;
1635
1636         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1637 out:    return err;
1638
1639 brd_input:
1640         if (skb->protocol != htons(ETH_P_IP))
1641                 goto e_inval;
1642
1643         if (!ipv4_is_zeronet(saddr)) {
1644                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1645                                           in_dev, &itag);
1646                 if (err < 0)
1647                         goto martian_source_keep_err;
1648         }
1649         flags |= RTCF_BROADCAST;
1650         res.type = RTN_BROADCAST;
1651         RT_CACHE_STAT_INC(in_brd);
1652
1653 local_input:
1654         do_cache = false;
1655         if (res.fi) {
1656                 if (!itag) {
1657                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1658                         if (rt_cache_valid(rth)) {
1659                                 skb_dst_set_noref(skb, &rth->dst);
1660                                 err = 0;
1661                                 goto out;
1662                         }
1663                         do_cache = true;
1664                 }
1665         }
1666
1667         rth = rt_dst_alloc(net->loopback_dev,
1668                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1669         if (!rth)
1670                 goto e_nobufs;
1671
1672         rth->dst.input= ip_local_deliver;
1673         rth->dst.output= ip_rt_bug;
1674 #ifdef CONFIG_IP_ROUTE_CLASSID
1675         rth->dst.tclassid = itag;
1676 #endif
1677
1678         rth->rt_genid = rt_genid(net);
1679         rth->rt_flags   = flags|RTCF_LOCAL;
1680         rth->rt_type    = res.type;
1681         rth->rt_is_input = 1;
1682         rth->rt_iif     = 0;
1683         rth->rt_pmtu    = 0;
1684         rth->rt_gateway = 0;
1685         INIT_LIST_HEAD(&rth->rt_uncached);
1686         if (res.type == RTN_UNREACHABLE) {
1687                 rth->dst.input= ip_error;
1688                 rth->dst.error= -err;
1689                 rth->rt_flags   &= ~RTCF_LOCAL;
1690         }
1691         if (do_cache)
1692                 rt_cache_route(&FIB_RES_NH(res), rth);
1693         skb_dst_set(skb, &rth->dst);
1694         err = 0;
1695         goto out;
1696
1697 no_route:
1698         RT_CACHE_STAT_INC(in_no_route);
1699         res.type = RTN_UNREACHABLE;
1700         if (err == -ESRCH)
1701                 err = -ENETUNREACH;
1702         goto local_input;
1703
1704         /*
1705          *      Do not cache martian addresses: they should be logged (RFC1812)
1706          */
1707 martian_destination:
1708         RT_CACHE_STAT_INC(in_martian_dst);
1709 #ifdef CONFIG_IP_ROUTE_VERBOSE
1710         if (IN_DEV_LOG_MARTIANS(in_dev))
1711                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1712                                      &daddr, &saddr, dev->name);
1713 #endif
1714
1715 e_inval:
1716         err = -EINVAL;
1717         goto out;
1718
1719 e_nobufs:
1720         err = -ENOBUFS;
1721         goto out;
1722
1723 martian_source:
1724         err = -EINVAL;
1725 martian_source_keep_err:
1726         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1727         goto out;
1728 }
1729
1730 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1731                          u8 tos, struct net_device *dev)
1732 {
1733         int res;
1734
1735         rcu_read_lock();
1736
1737         /* Multicast recognition logic is moved from route cache to here.
1738            The problem was that too many Ethernet cards have broken/missing
1739            hardware multicast filters :-( As result the host on multicasting
1740            network acquires a lot of useless route cache entries, sort of
1741            SDR messages from all the world. Now we try to get rid of them.
1742            Really, provided software IP multicast filter is organized
1743            reasonably (at least, hashed), it does not result in a slowdown
1744            comparing with route cache reject entries.
1745            Note, that multicast routers are not affected, because
1746            route cache entry is created eventually.
1747          */
1748         if (ipv4_is_multicast(daddr)) {
1749                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1750
1751                 if (in_dev) {
1752                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1753                                                   ip_hdr(skb)->protocol);
1754                         if (our
1755 #ifdef CONFIG_IP_MROUTE
1756                                 ||
1757                             (!ipv4_is_local_multicast(daddr) &&
1758                              IN_DEV_MFORWARD(in_dev))
1759 #endif
1760                            ) {
1761                                 int res = ip_route_input_mc(skb, daddr, saddr,
1762                                                             tos, dev, our);
1763                                 rcu_read_unlock();
1764                                 return res;
1765                         }
1766                 }
1767                 rcu_read_unlock();
1768                 return -EINVAL;
1769         }
1770         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1771         rcu_read_unlock();
1772         return res;
1773 }
1774 EXPORT_SYMBOL(ip_route_input_noref);
1775
1776 /* called with rcu_read_lock() */
1777 static struct rtable *__mkroute_output(const struct fib_result *res,
1778                                        const struct flowi4 *fl4, int orig_oif,
1779                                        struct net_device *dev_out,
1780                                        unsigned int flags)
1781 {
1782         struct fib_info *fi = res->fi;
1783         struct fib_nh_exception *fnhe;
1784         struct in_device *in_dev;
1785         u16 type = res->type;
1786         struct rtable *rth;
1787
1788         in_dev = __in_dev_get_rcu(dev_out);
1789         if (!in_dev)
1790                 return ERR_PTR(-EINVAL);
1791
1792         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1793                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1794                         return ERR_PTR(-EINVAL);
1795
1796         if (ipv4_is_lbcast(fl4->daddr))
1797                 type = RTN_BROADCAST;
1798         else if (ipv4_is_multicast(fl4->daddr))
1799                 type = RTN_MULTICAST;
1800         else if (ipv4_is_zeronet(fl4->daddr))
1801                 return ERR_PTR(-EINVAL);
1802
1803         if (dev_out->flags & IFF_LOOPBACK)
1804                 flags |= RTCF_LOCAL;
1805
1806         if (type == RTN_BROADCAST) {
1807                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1808                 fi = NULL;
1809         } else if (type == RTN_MULTICAST) {
1810                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1811                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1812                                      fl4->flowi4_proto))
1813                         flags &= ~RTCF_LOCAL;
1814                 /* If multicast route do not exist use
1815                  * default one, but do not gateway in this case.
1816                  * Yes, it is hack.
1817                  */
1818                 if (fi && res->prefixlen < 4)
1819                         fi = NULL;
1820         }
1821
1822         fnhe = NULL;
1823         if (fi) {
1824                 struct rtable __rcu **prth;
1825
1826                 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1827                 if (fnhe)
1828                         prth = &fnhe->fnhe_rth;
1829                 else
1830                         prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
1831                 rth = rcu_dereference(*prth);
1832                 if (rt_cache_valid(rth)) {
1833                         dst_hold(&rth->dst);
1834                         return rth;
1835                 }
1836         }
1837         rth = rt_dst_alloc(dev_out,
1838                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1839                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1840                            fi);
1841         if (!rth)
1842                 return ERR_PTR(-ENOBUFS);
1843
1844         rth->dst.output = ip_output;
1845
1846         rth->rt_genid = rt_genid(dev_net(dev_out));
1847         rth->rt_flags   = flags;
1848         rth->rt_type    = type;
1849         rth->rt_is_input = 0;
1850         rth->rt_iif     = orig_oif ? : 0;
1851         rth->rt_pmtu    = 0;
1852         rth->rt_gateway = 0;
1853         INIT_LIST_HEAD(&rth->rt_uncached);
1854
1855         RT_CACHE_STAT_INC(out_slow_tot);
1856
1857         if (flags & RTCF_LOCAL)
1858                 rth->dst.input = ip_local_deliver;
1859         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1860                 if (flags & RTCF_LOCAL &&
1861                     !(dev_out->flags & IFF_LOOPBACK)) {
1862                         rth->dst.output = ip_mc_output;
1863                         RT_CACHE_STAT_INC(out_slow_mc);
1864                 }
1865 #ifdef CONFIG_IP_MROUTE
1866                 if (type == RTN_MULTICAST) {
1867                         if (IN_DEV_MFORWARD(in_dev) &&
1868                             !ipv4_is_local_multicast(fl4->daddr)) {
1869                                 rth->dst.input = ip_mr_input;
1870                                 rth->dst.output = ip_mc_output;
1871                         }
1872                 }
1873 #endif
1874         }
1875
1876         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1877
1878         return rth;
1879 }
1880
1881 /*
1882  * Major route resolver routine.
1883  */
1884
1885 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1886 {
1887         struct net_device *dev_out = NULL;
1888         __u8 tos = RT_FL_TOS(fl4);
1889         unsigned int flags = 0;
1890         struct fib_result res;
1891         struct rtable *rth;
1892         int orig_oif;
1893
1894         res.tclassid    = 0;
1895         res.fi          = NULL;
1896         res.table       = NULL;
1897
1898         orig_oif = fl4->flowi4_oif;
1899
1900         fl4->flowi4_iif = LOOPBACK_IFINDEX;
1901         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1902         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1903                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1904
1905         rcu_read_lock();
1906         if (fl4->saddr) {
1907                 rth = ERR_PTR(-EINVAL);
1908                 if (ipv4_is_multicast(fl4->saddr) ||
1909                     ipv4_is_lbcast(fl4->saddr) ||
1910                     ipv4_is_zeronet(fl4->saddr))
1911                         goto out;
1912
1913                 /* I removed check for oif == dev_out->oif here.
1914                    It was wrong for two reasons:
1915                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1916                       is assigned to multiple interfaces.
1917                    2. Moreover, we are allowed to send packets with saddr
1918                       of another iface. --ANK
1919                  */
1920
1921                 if (fl4->flowi4_oif == 0 &&
1922                     (ipv4_is_multicast(fl4->daddr) ||
1923                      ipv4_is_lbcast(fl4->daddr))) {
1924                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1925                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1926                         if (dev_out == NULL)
1927                                 goto out;
1928
1929                         /* Special hack: user can direct multicasts
1930                            and limited broadcast via necessary interface
1931                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1932                            This hack is not just for fun, it allows
1933                            vic,vat and friends to work.
1934                            They bind socket to loopback, set ttl to zero
1935                            and expect that it will work.
1936                            From the viewpoint of routing cache they are broken,
1937                            because we are not allowed to build multicast path
1938                            with loopback source addr (look, routing cache
1939                            cannot know, that ttl is zero, so that packet
1940                            will not leave this host and route is valid).
1941                            Luckily, this hack is good workaround.
1942                          */
1943
1944                         fl4->flowi4_oif = dev_out->ifindex;
1945                         goto make_route;
1946                 }
1947
1948                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1949                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1950                         if (!__ip_dev_find(net, fl4->saddr, false))
1951                                 goto out;
1952                 }
1953         }
1954
1955
1956         if (fl4->flowi4_oif) {
1957                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1958                 rth = ERR_PTR(-ENODEV);
1959                 if (dev_out == NULL)
1960                         goto out;
1961
1962                 /* RACE: Check return value of inet_select_addr instead. */
1963                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1964                         rth = ERR_PTR(-ENETUNREACH);
1965                         goto out;
1966                 }
1967                 if (ipv4_is_local_multicast(fl4->daddr) ||
1968                     ipv4_is_lbcast(fl4->daddr)) {
1969                         if (!fl4->saddr)
1970                                 fl4->saddr = inet_select_addr(dev_out, 0,
1971                                                               RT_SCOPE_LINK);
1972                         goto make_route;
1973                 }
1974                 if (fl4->saddr) {
1975                         if (ipv4_is_multicast(fl4->daddr))
1976                                 fl4->saddr = inet_select_addr(dev_out, 0,
1977                                                               fl4->flowi4_scope);
1978                         else if (!fl4->daddr)
1979                                 fl4->saddr = inet_select_addr(dev_out, 0,
1980                                                               RT_SCOPE_HOST);
1981                 }
1982         }
1983
1984         if (!fl4->daddr) {
1985                 fl4->daddr = fl4->saddr;
1986                 if (!fl4->daddr)
1987                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1988                 dev_out = net->loopback_dev;
1989                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
1990                 res.type = RTN_LOCAL;
1991                 flags |= RTCF_LOCAL;
1992                 goto make_route;
1993         }
1994
1995         if (fib_lookup(net, fl4, &res)) {
1996                 res.fi = NULL;
1997                 res.table = NULL;
1998                 if (fl4->flowi4_oif) {
1999                         /* Apparently, routing tables are wrong. Assume,
2000                            that the destination is on link.
2001
2002                            WHY? DW.
2003                            Because we are allowed to send to iface
2004                            even if it has NO routes and NO assigned
2005                            addresses. When oif is specified, routing
2006                            tables are looked up with only one purpose:
2007                            to catch if destination is gatewayed, rather than
2008                            direct. Moreover, if MSG_DONTROUTE is set,
2009                            we send packet, ignoring both routing tables
2010                            and ifaddr state. --ANK
2011
2012
2013                            We could make it even if oif is unknown,
2014                            likely IPv6, but we do not.
2015                          */
2016
2017                         if (fl4->saddr == 0)
2018                                 fl4->saddr = inet_select_addr(dev_out, 0,
2019                                                               RT_SCOPE_LINK);
2020                         res.type = RTN_UNICAST;
2021                         goto make_route;
2022                 }
2023                 rth = ERR_PTR(-ENETUNREACH);
2024                 goto out;
2025         }
2026
2027         if (res.type == RTN_LOCAL) {
2028                 if (!fl4->saddr) {
2029                         if (res.fi->fib_prefsrc)
2030                                 fl4->saddr = res.fi->fib_prefsrc;
2031                         else
2032                                 fl4->saddr = fl4->daddr;
2033                 }
2034                 dev_out = net->loopback_dev;
2035                 fl4->flowi4_oif = dev_out->ifindex;
2036                 flags |= RTCF_LOCAL;
2037                 goto make_route;
2038         }
2039
2040 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2041         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2042                 fib_select_multipath(&res);
2043         else
2044 #endif
2045         if (!res.prefixlen &&
2046             res.table->tb_num_default > 1 &&
2047             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2048                 fib_select_default(&res);
2049
2050         if (!fl4->saddr)
2051                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2052
2053         dev_out = FIB_RES_DEV(res);
2054         fl4->flowi4_oif = dev_out->ifindex;
2055
2056
2057 make_route:
2058         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2059
2060 out:
2061         rcu_read_unlock();
2062         return rth;
2063 }
2064 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2065
2066 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2067 {
2068         return NULL;
2069 }
2070
2071 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2072 {
2073         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2074
2075         return mtu ? : dst->dev->mtu;
2076 }
2077
2078 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2079                                           struct sk_buff *skb, u32 mtu)
2080 {
2081 }
2082
2083 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2084                                        struct sk_buff *skb)
2085 {
2086 }
2087
2088 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2089                                           unsigned long old)
2090 {
2091         return NULL;
2092 }
2093
2094 static struct dst_ops ipv4_dst_blackhole_ops = {
2095         .family                 =       AF_INET,
2096         .protocol               =       cpu_to_be16(ETH_P_IP),
2097         .check                  =       ipv4_blackhole_dst_check,
2098         .mtu                    =       ipv4_blackhole_mtu,
2099         .default_advmss         =       ipv4_default_advmss,
2100         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2101         .redirect               =       ipv4_rt_blackhole_redirect,
2102         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2103         .neigh_lookup           =       ipv4_neigh_lookup,
2104 };
2105
2106 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2107 {
2108         struct rtable *ort = (struct rtable *) dst_orig;
2109         struct rtable *rt;
2110
2111         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2112         if (rt) {
2113                 struct dst_entry *new = &rt->dst;
2114
2115                 new->__use = 1;
2116                 new->input = dst_discard;
2117                 new->output = dst_discard;
2118
2119                 new->dev = ort->dst.dev;
2120                 if (new->dev)
2121                         dev_hold(new->dev);
2122
2123                 rt->rt_is_input = ort->rt_is_input;
2124                 rt->rt_iif = ort->rt_iif;
2125                 rt->rt_pmtu = ort->rt_pmtu;
2126
2127                 rt->rt_genid = rt_genid(net);
2128                 rt->rt_flags = ort->rt_flags;
2129                 rt->rt_type = ort->rt_type;
2130                 rt->rt_gateway = ort->rt_gateway;
2131
2132                 INIT_LIST_HEAD(&rt->rt_uncached);
2133
2134                 dst_free(new);
2135         }
2136
2137         dst_release(dst_orig);
2138
2139         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2140 }
2141
2142 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2143                                     struct sock *sk)
2144 {
2145         struct rtable *rt = __ip_route_output_key(net, flp4);
2146
2147         if (IS_ERR(rt))
2148                 return rt;
2149
2150         if (flp4->flowi4_proto)
2151                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2152                                                    flowi4_to_flowi(flp4),
2153                                                    sk, 0);
2154
2155         return rt;
2156 }
2157 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2158
2159 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2160                         struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
2161                         u32 seq, int event, int nowait, unsigned int flags)
2162 {
2163         struct rtable *rt = skb_rtable(skb);
2164         struct rtmsg *r;
2165         struct nlmsghdr *nlh;
2166         unsigned long expires = 0;
2167         u32 error;
2168         u32 metrics[RTAX_MAX];
2169
2170         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2171         if (nlh == NULL)
2172                 return -EMSGSIZE;
2173
2174         r = nlmsg_data(nlh);
2175         r->rtm_family    = AF_INET;
2176         r->rtm_dst_len  = 32;
2177         r->rtm_src_len  = 0;
2178         r->rtm_tos      = fl4->flowi4_tos;
2179         r->rtm_table    = RT_TABLE_MAIN;
2180         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2181                 goto nla_put_failure;
2182         r->rtm_type     = rt->rt_type;
2183         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2184         r->rtm_protocol = RTPROT_UNSPEC;
2185         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2186         if (rt->rt_flags & RTCF_NOTIFY)
2187                 r->rtm_flags |= RTM_F_NOTIFY;
2188
2189         if (nla_put_be32(skb, RTA_DST, dst))
2190                 goto nla_put_failure;
2191         if (src) {
2192                 r->rtm_src_len = 32;
2193                 if (nla_put_be32(skb, RTA_SRC, src))
2194                         goto nla_put_failure;
2195         }
2196         if (rt->dst.dev &&
2197             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2198                 goto nla_put_failure;
2199 #ifdef CONFIG_IP_ROUTE_CLASSID
2200         if (rt->dst.tclassid &&
2201             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2202                 goto nla_put_failure;
2203 #endif
2204         if (!rt_is_input_route(rt) &&
2205             fl4->saddr != src) {
2206                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2207                         goto nla_put_failure;
2208         }
2209         if (rt->rt_gateway &&
2210             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2211                 goto nla_put_failure;
2212
2213         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2214         if (rt->rt_pmtu)
2215                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2216         if (rtnetlink_put_metrics(skb, metrics) < 0)
2217                 goto nla_put_failure;
2218
2219         if (fl4->flowi4_mark &&
2220             nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
2221                 goto nla_put_failure;
2222
2223         error = rt->dst.error;
2224         expires = rt->dst.expires;
2225         if (expires) {
2226                 if (time_before(jiffies, expires))
2227                         expires -= jiffies;
2228                 else
2229                         expires = 0;
2230         }
2231
2232         if (rt_is_input_route(rt)) {
2233                 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2234                         goto nla_put_failure;
2235         }
2236
2237         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2238                 goto nla_put_failure;
2239
2240         return nlmsg_end(skb, nlh);
2241
2242 nla_put_failure:
2243         nlmsg_cancel(skb, nlh);
2244         return -EMSGSIZE;
2245 }
2246
2247 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2248 {
2249         struct net *net = sock_net(in_skb->sk);
2250         struct rtmsg *rtm;
2251         struct nlattr *tb[RTA_MAX+1];
2252         struct rtable *rt = NULL;
2253         struct flowi4 fl4;
2254         __be32 dst = 0;
2255         __be32 src = 0;
2256         u32 iif;
2257         int err;
2258         int mark;
2259         struct sk_buff *skb;
2260
2261         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2262         if (err < 0)
2263                 goto errout;
2264
2265         rtm = nlmsg_data(nlh);
2266
2267         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2268         if (skb == NULL) {
2269                 err = -ENOBUFS;
2270                 goto errout;
2271         }
2272
2273         /* Reserve room for dummy headers, this skb can pass
2274            through good chunk of routing engine.
2275          */
2276         skb_reset_mac_header(skb);
2277         skb_reset_network_header(skb);
2278
2279         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2280         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2281         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2282
2283         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2284         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2285         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2286         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2287
2288         memset(&fl4, 0, sizeof(fl4));
2289         fl4.daddr = dst;
2290         fl4.saddr = src;
2291         fl4.flowi4_tos = rtm->rtm_tos;
2292         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2293         fl4.flowi4_mark = mark;
2294
2295         if (iif) {
2296                 struct net_device *dev;
2297
2298                 dev = __dev_get_by_index(net, iif);
2299                 if (dev == NULL) {
2300                         err = -ENODEV;
2301                         goto errout_free;
2302                 }
2303
2304                 skb->protocol   = htons(ETH_P_IP);
2305                 skb->dev        = dev;
2306                 skb->mark       = mark;
2307                 local_bh_disable();
2308                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2309                 local_bh_enable();
2310
2311                 rt = skb_rtable(skb);
2312                 if (err == 0 && rt->dst.error)
2313                         err = -rt->dst.error;
2314         } else {
2315                 rt = ip_route_output_key(net, &fl4);
2316
2317                 err = 0;
2318                 if (IS_ERR(rt))
2319                         err = PTR_ERR(rt);
2320         }
2321
2322         if (err)
2323                 goto errout_free;
2324
2325         skb_dst_set(skb, &rt->dst);
2326         if (rtm->rtm_flags & RTM_F_NOTIFY)
2327                 rt->rt_flags |= RTCF_NOTIFY;
2328
2329         err = rt_fill_info(net, dst, src, &fl4, skb,
2330                            NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2331                            RTM_NEWROUTE, 0, 0);
2332         if (err <= 0)
2333                 goto errout_free;
2334
2335         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2336 errout:
2337         return err;
2338
2339 errout_free:
2340         kfree_skb(skb);
2341         goto errout;
2342 }
2343
2344 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2345 {
2346         return skb->len;
2347 }
2348
2349 void ip_rt_multicast_event(struct in_device *in_dev)
2350 {
2351         rt_cache_flush(dev_net(in_dev->dev), 0);
2352 }
2353
2354 #ifdef CONFIG_SYSCTL
2355 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2356                                         void __user *buffer,
2357                                         size_t *lenp, loff_t *ppos)
2358 {
2359         if (write) {
2360                 int flush_delay;
2361                 ctl_table ctl;
2362                 struct net *net;
2363
2364                 memcpy(&ctl, __ctl, sizeof(ctl));
2365                 ctl.data = &flush_delay;
2366                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
2367
2368                 net = (struct net *)__ctl->extra1;
2369                 rt_cache_flush(net, flush_delay);
2370                 return 0;
2371         }
2372
2373         return -EINVAL;
2374 }
2375
2376 static ctl_table ipv4_route_table[] = {
2377         {
2378                 .procname       = "gc_thresh",
2379                 .data           = &ipv4_dst_ops.gc_thresh,
2380                 .maxlen         = sizeof(int),
2381                 .mode           = 0644,
2382                 .proc_handler   = proc_dointvec,
2383         },
2384         {
2385                 .procname       = "max_size",
2386                 .data           = &ip_rt_max_size,
2387                 .maxlen         = sizeof(int),
2388                 .mode           = 0644,
2389                 .proc_handler   = proc_dointvec,
2390         },
2391         {
2392                 /*  Deprecated. Use gc_min_interval_ms */
2393
2394                 .procname       = "gc_min_interval",
2395                 .data           = &ip_rt_gc_min_interval,
2396                 .maxlen         = sizeof(int),
2397                 .mode           = 0644,
2398                 .proc_handler   = proc_dointvec_jiffies,
2399         },
2400         {
2401                 .procname       = "gc_min_interval_ms",
2402                 .data           = &ip_rt_gc_min_interval,
2403                 .maxlen         = sizeof(int),
2404                 .mode           = 0644,
2405                 .proc_handler   = proc_dointvec_ms_jiffies,
2406         },
2407         {
2408                 .procname       = "gc_timeout",
2409                 .data           = &ip_rt_gc_timeout,
2410                 .maxlen         = sizeof(int),
2411                 .mode           = 0644,
2412                 .proc_handler   = proc_dointvec_jiffies,
2413         },
2414         {
2415                 .procname       = "gc_interval",
2416                 .data           = &ip_rt_gc_interval,
2417                 .maxlen         = sizeof(int),
2418                 .mode           = 0644,
2419                 .proc_handler   = proc_dointvec_jiffies,
2420         },
2421         {
2422                 .procname       = "redirect_load",
2423                 .data           = &ip_rt_redirect_load,
2424                 .maxlen         = sizeof(int),
2425                 .mode           = 0644,
2426                 .proc_handler   = proc_dointvec,
2427         },
2428         {
2429                 .procname       = "redirect_number",
2430                 .data           = &ip_rt_redirect_number,
2431                 .maxlen         = sizeof(int),
2432                 .mode           = 0644,
2433                 .proc_handler   = proc_dointvec,
2434         },
2435         {
2436                 .procname       = "redirect_silence",
2437                 .data           = &ip_rt_redirect_silence,
2438                 .maxlen         = sizeof(int),
2439                 .mode           = 0644,
2440                 .proc_handler   = proc_dointvec,
2441         },
2442         {
2443                 .procname       = "error_cost",
2444                 .data           = &ip_rt_error_cost,
2445                 .maxlen         = sizeof(int),
2446                 .mode           = 0644,
2447                 .proc_handler   = proc_dointvec,
2448         },
2449         {
2450                 .procname       = "error_burst",
2451                 .data           = &ip_rt_error_burst,
2452                 .maxlen         = sizeof(int),
2453                 .mode           = 0644,
2454                 .proc_handler   = proc_dointvec,
2455         },
2456         {
2457                 .procname       = "gc_elasticity",
2458                 .data           = &ip_rt_gc_elasticity,
2459                 .maxlen         = sizeof(int),
2460                 .mode           = 0644,
2461                 .proc_handler   = proc_dointvec,
2462         },
2463         {
2464                 .procname       = "mtu_expires",
2465                 .data           = &ip_rt_mtu_expires,
2466                 .maxlen         = sizeof(int),
2467                 .mode           = 0644,
2468                 .proc_handler   = proc_dointvec_jiffies,
2469         },
2470         {
2471                 .procname       = "min_pmtu",
2472                 .data           = &ip_rt_min_pmtu,
2473                 .maxlen         = sizeof(int),
2474                 .mode           = 0644,
2475                 .proc_handler   = proc_dointvec,
2476         },
2477         {
2478                 .procname       = "min_adv_mss",
2479                 .data           = &ip_rt_min_advmss,
2480                 .maxlen         = sizeof(int),
2481                 .mode           = 0644,
2482                 .proc_handler   = proc_dointvec,
2483         },
2484         { }
2485 };
2486
2487 static struct ctl_table ipv4_route_flush_table[] = {
2488         {
2489                 .procname       = "flush",
2490                 .maxlen         = sizeof(int),
2491                 .mode           = 0200,
2492                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2493         },
2494         { },
2495 };
2496
2497 static __net_init int sysctl_route_net_init(struct net *net)
2498 {
2499         struct ctl_table *tbl;
2500
2501         tbl = ipv4_route_flush_table;
2502         if (!net_eq(net, &init_net)) {
2503                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2504                 if (tbl == NULL)
2505                         goto err_dup;
2506         }
2507         tbl[0].extra1 = net;
2508
2509         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2510         if (net->ipv4.route_hdr == NULL)
2511                 goto err_reg;
2512         return 0;
2513
2514 err_reg:
2515         if (tbl != ipv4_route_flush_table)
2516                 kfree(tbl);
2517 err_dup:
2518         return -ENOMEM;
2519 }
2520
2521 static __net_exit void sysctl_route_net_exit(struct net *net)
2522 {
2523         struct ctl_table *tbl;
2524
2525         tbl = net->ipv4.route_hdr->ctl_table_arg;
2526         unregister_net_sysctl_table(net->ipv4.route_hdr);
2527         BUG_ON(tbl == ipv4_route_flush_table);
2528         kfree(tbl);
2529 }
2530
2531 static __net_initdata struct pernet_operations sysctl_route_ops = {
2532         .init = sysctl_route_net_init,
2533         .exit = sysctl_route_net_exit,
2534 };
2535 #endif
2536
2537 static __net_init int rt_genid_init(struct net *net)
2538 {
2539         get_random_bytes(&net->ipv4.rt_genid,
2540                          sizeof(net->ipv4.rt_genid));
2541         get_random_bytes(&net->ipv4.dev_addr_genid,
2542                          sizeof(net->ipv4.dev_addr_genid));
2543         return 0;
2544 }
2545
2546 static __net_initdata struct pernet_operations rt_genid_ops = {
2547         .init = rt_genid_init,
2548 };
2549
2550 static int __net_init ipv4_inetpeer_init(struct net *net)
2551 {
2552         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2553
2554         if (!bp)
2555                 return -ENOMEM;
2556         inet_peer_base_init(bp);
2557         net->ipv4.peers = bp;
2558         return 0;
2559 }
2560
2561 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2562 {
2563         struct inet_peer_base *bp = net->ipv4.peers;
2564
2565         net->ipv4.peers = NULL;
2566         inetpeer_invalidate_tree(bp);
2567         kfree(bp);
2568 }
2569
2570 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2571         .init   =       ipv4_inetpeer_init,
2572         .exit   =       ipv4_inetpeer_exit,
2573 };
2574
2575 #ifdef CONFIG_IP_ROUTE_CLASSID
2576 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2577 #endif /* CONFIG_IP_ROUTE_CLASSID */
2578
2579 int __init ip_rt_init(void)
2580 {
2581         int rc = 0;
2582
2583 #ifdef CONFIG_IP_ROUTE_CLASSID
2584         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2585         if (!ip_rt_acct)
2586                 panic("IP: failed to allocate ip_rt_acct\n");
2587 #endif
2588
2589         ipv4_dst_ops.kmem_cachep =
2590                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2591                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2592
2593         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2594
2595         if (dst_entries_init(&ipv4_dst_ops) < 0)
2596                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2597
2598         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2599                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2600
2601         ipv4_dst_ops.gc_thresh = ~0;
2602         ip_rt_max_size = INT_MAX;
2603
2604         devinet_init();
2605         ip_fib_init();
2606
2607         if (ip_rt_proc_init())
2608                 pr_err("Unable to create route proc files\n");
2609 #ifdef CONFIG_XFRM
2610         xfrm_init();
2611         xfrm4_init(ip_rt_max_size);
2612 #endif
2613         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2614
2615 #ifdef CONFIG_SYSCTL
2616         register_pernet_subsys(&sysctl_route_ops);
2617 #endif
2618         register_pernet_subsys(&rt_genid_ops);
2619         register_pernet_subsys(&ipv4_inetpeer_ops);
2620         return rc;
2621 }
2622
2623 #ifdef CONFIG_SYSCTL
2624 /*
2625  * We really need to sanitize the damn ipv4 init order, then all
2626  * this nonsense will go away.
2627  */
2628 void __init ip_static_sysctl_init(void)
2629 {
2630         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2631 }
2632 #endif