/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case a packet is not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov :	Transparent proxy revived after a year in coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */
#include <asm/uaccess.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);
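
/*
 * Illustration (not part of the build): RFC 791 defines iph->check as the
 * 16-bit one's complement of the one's complement sum of the header words.
 * A minimal re-check over an option-less 20-byte header might look like:
 *
 *	u16 *p = (u16 *)iph;
 *	u32 sum = 0;
 *	int i;
 *
 *	for (i = 0; i < 10; i++)	 // ihl == 5, i.e. ten 16-bit words
 *		sum += p[i];
 *	sum = (sum & 0xffff) + (sum >> 16);	// fold the carries
 *	sum = (sum & 0xffff) + (sum >> 16);
 *	// after ip_send_check(), sum == 0xffff for an intact header
 */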

int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	skb_dst_force(newskb);
	netif_rx_ni(newskb);
	return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}
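
/*
 * Example (illustrative): inet->uc_ttl stays at -1 until userspace
 * overrides it, e.g.
 *
 *	int ttl = 5;
 *	setsockopt(fd, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl));
 *
 * after which ip_select_ttl() returns 5 instead of the per-route hop
 * limit derived from sysctl_ip_default_ttl.
 */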

/*
 *		Add an ip header to a skbuff and send it out.
 *
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr    = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(skb, sk);

	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
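
/*
 * Note: the main in-tree caller of ip_build_and_send_pkt() is the TCP
 * SYN-ACK path (tcp_v4_send_synack()), which already holds a routed skb
 * plus the request's addresses and options.
 */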

static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	rcu_read_lock();
	neigh = dst_get_neighbour_noref(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}

int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

/*
 * copy saddr and daddr, possibly using 64bit load/stores
 * Equivalent to :
 *	iph->saddr = fl4->saddr;
 *	iph->daddr = fl4->daddr;
 */
static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
	memcpy(&iph->saddr, &fl4->saddr,
	       sizeof(fl4->saddr) + sizeof(fl4->daddr));
}
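
/*
 * The single memcpy() above is safe because struct iphdr also places
 * saddr immediately before daddr, mirroring the flowi4 layout that the
 * BUILD_BUG_ON() pins down at compile time, so both addresses can move
 * in one (possibly 64-bit) load/store pair.
 */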

int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);

	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);
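
/*
 * Typical use (illustrative): a connected transport hands its finished
 * segment to IP together with the flow cached in the socket's cork, the
 * way TCP's queue_xmit hook does:
 *
 *	err = ip_queue_xmit(skb, &inet->cork.fl);
 */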

static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up
 *	into smaller pieces (each of size equal to the IP header plus a block
 *	of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when we see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    ip_is_fragment(iph) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}

		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);
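
/*
 * Worked example of the offset encoding above (illustrative): fragmenting
 * a 4020-byte datagram (20-byte header + 4000 bytes of payload) for a
 * 1500-byte MTU yields 1480-byte payload chunks, already 8-byte aligned:
 *
 *	fragment 1: bytes    0..1479, iph->frag_off = htons(0)   | htons(IP_MF)
 *	fragment 2: bytes 1480..2959, iph->frag_off = htons(185) | htons(IP_MF)
 *	fragment 3: bytes 2960..3999, iph->frag_off = htons(370)
 *
 * i.e. offsets travel in 8-byte units (1480 >> 3 == 185) and IP_MF is set
 * on every fragment except the last.
 */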

int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);
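
/*
 * The getfrag() contract, in brief: copy @len bytes of the caller's source
 * starting at @offset into @to and, unless the skb is CHECKSUM_PARTIAL,
 * fold the partial checksum into skb->csum. A minimal kernel-space variant
 * over a flat buffer (a sketch; ip_reply_glue_bits() below is the in-tree
 * example):
 *
 *	static int flat_getfrag(void *from, char *to, int offset,
 *				int len, int odd, struct sk_buff *skb)
 *	{
 *		__wsum csum = csum_partial_copy_nocheck((char *)from + offset,
 *							to, len, 0);
 *		skb->csum = csum_block_add(skb->csum, csum, odd);
 *		return 0;
 *	}
 */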

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr = kmap(page);
	__wsum csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int maxfraglen, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
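
/*
 * Example of the gso_size chosen above: with a 1500-byte MTU and no IP
 * options, maxfraglen - fragheaderlen = 1500 - 20 = 1480, so the UFO
 * device later carves the single large UDP datagram into on-wire
 * fragments of 1480 payload bytes each, the same geometry that
 * ip_fragment() would produce in software.
 */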

static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	skb = skb_peek_tail(queue);

	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
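	/*
	 * e.g. mtu = 1500 with no options: fragheaderlen = 20 and
	 * maxfraglen = ((1500 - 20) & ~7) + 20 = 1500, since 1480 is
	 * already a multiple of 8. An MTU of 1006 would round the
	 * 986-byte data space down to 984, giving maxfraglen = 1004.
	 */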
	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu - exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	cork->length += length;
	if (((length > mtu) || (skb && skb_has_frags(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 maxfraglen, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			skb_frag_size_add(frag, copy);
		}
		skb->len += copy;
		skb->data_len += copy;
		skb->truesize += copy;
		atomic_add(copy, &sk->sk_wmem_alloc);
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, other transport protocols - e.g. raw sockets - can use
 *	this interface potentially.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
				from, length, transhdrlen, flags);
}
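
/*
 * A condensed sketch of the corking pattern udp_sendmsg() follows
 * (illustrative; locking and error handling trimmed):
 *
 *	err = ip_append_data(sk, &fl4, getfrag, msg->msg_iov, ulen,
 *			     sizeof(struct udphdr), &ipc, &rt,
 *			     corkreq ? msg->msg_flags | MSG_MORE
 *				     : msg->msg_flags);
 *	if (err)
 *		udp_flush_pending_frames(sk);
 *	else if (!corkreq)
 *		err = udp_push_pending_frames(sk);
 *
 * where the final call ultimately drains the queue via
 * __ip_make_skb()/ip_send_skb().
 */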

ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}

	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}

	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}

/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push it out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * the frame generated here to be fragmented. No matter how transforms
	 * change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow this frame to be
	 * fragmented locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(skb, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);

	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}

int ip_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	int err;

	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk, fl4);
	if (!skb)
		return 0;

	/* Netfilter gets the whole, not yet fragmented skb. */
	return ip_send_skb(skb);
}

/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
}

struct sk_buff *ip_make_skb(struct sock *sk,
			    struct flowi4 *fl4,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork;
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.flags = 0;
	cork.addr = 0;
	cork.opt = NULL;
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, fl4, &queue, &cork);
}

/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
		   const struct ip_reply_arg *arg, unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt.opt, skb))
		return;

	ipc.addr = daddr;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
			   RT_TOS(arg->tos),
			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
			   ip_reply_arg_flowi_flags(arg),
			   daddr, rt->rt_spec_dst,
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
	rt = ip_route_output_key(sock_net(sk), &fl4);
	if (IS_ERR(rt))
		return;

	/* And let IP do all the hard work.

	   This chunk is not reentrant, hence the spinlock.
	   Note that it relies on the fact that this function is called
	   with BHs locally disabled and that sk cannot already be
	   spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = arg->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk, &fl4);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}