net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
  16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/netfilter.h>
  43 #include <linux/netfilter_ipv6.h>
  44
  45 #include <net/sock.h>
  46 #include <net/snmp.h>
  47
  48 #include <net/ipv6.h>
  49 #include <net/ndisc.h>
  50 #include <net/protocol.h>
  51 #include <net/ip6_route.h>
  52 #include <net/addrconf.h>
  53 #include <net/rawv6.h>
  54 #include <net/icmp.h>
  55 #include <net/xfrm.h>
  56 #include <net/checksum.h>
  57 #include <linux/mroute6.h>
  58
  59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  60
  61 int __ip6_local_out(struct sk_buff *skb)
  62 {
  63         int len;
  64
  65         len = skb->len - sizeof(struct ipv6hdr);
  66         if (len > IPV6_MAXPLEN)
  67                 len = 0;
  68         ipv6_hdr(skb)->payload_len = htons(len);
  69
  70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
  71                        skb_dst(skb)->dev, dst_output);
  72 }
  73
  74 int ip6_local_out(struct sk_buff *skb)
  75 {
  76         int err;
  77
  78         err = __ip6_local_out(skb);
  79         if (likely(err == 1))
  80                 err = dst_output(skb);
  81
  82         return err;
  83 }
  84 EXPORT_SYMBOL_GPL(ip6_local_out);
  85
  86 static int ip6_finish_output2(struct sk_buff *skb)
  87 {
  88         struct dst_entry *dst = skb_dst(skb);
  89         struct net_device *dev = dst->dev;
  90         struct neighbour *neigh;
  91         struct rt6_info *rt;
  92
  93         skb->protocol = htons(ETH_P_IPV6);
  94         skb->dev = dev;
  95
  96         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  97                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  98
  99                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
 100                     ((mroute6_socket(dev_net(dev), skb) &&
 101                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 102                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 103                                          &ipv6_hdr(skb)->saddr))) {
 104                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 105
 106                         /* Do not check for IFF_ALLMULTI; multicast routing
 107                            is not supported in any case.
 108                          */
 109                         if (newskb)
 110                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 111                                         newskb, NULL, newskb->dev,
 112                                         dev_loopback_xmit);
 113
 114                         if (ipv6_hdr(skb)->hop_limit == 0) {
 115                                 IP6_INC_STATS(dev_net(dev), idev,
 116                                               IPSTATS_MIB_OUTDISCARDS);
 117                                 kfree_skb(skb);
 118                                 return 0;
 119                         }
 120                 }
 121
 122                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
 123                                 skb->len);
 124         }
 125
 126         rt = (struct rt6_info *) dst;
 127         neigh = rt->n;
 128         if (neigh)
 129                 return dst_neigh_output(dst, neigh, skb);
 130
 131         IP6_INC_STATS_BH(dev_net(dst->dev),
 132                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 133         kfree_skb(skb);
 134         return -EINVAL;
 135 }
 136
 137 static int ip6_finish_output(struct sk_buff *skb)
 138 {
 139         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 140             dst_allfrag(skb_dst(skb)))
 141                 return ip6_fragment(skb, ip6_finish_output2);
 142         else
 143                 return ip6_finish_output2(skb);
 144 }
 145
 146 int ip6_output(struct sk_buff *skb)
 147 {
 148         struct net_device *dev = skb_dst(skb)->dev;
 149         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 150         if (unlikely(idev->cnf.disable_ipv6)) {
 151                 IP6_INC_STATS(dev_net(dev), idev,
 152                               IPSTATS_MIB_OUTDISCARDS);
 153                 kfree_skb(skb);
 154                 return 0;
 155         }
 156
 157         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
 158                             ip6_finish_output,
 159                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 160 }
 161
 162 /*
 163  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
 164  */
 165
 166 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 167              struct ipv6_txoptions *opt, int tclass)
 168 {
 169         struct net *net = sock_net(sk);
 170         struct ipv6_pinfo *np = inet6_sk(sk);
 171         struct in6_addr *first_hop = &fl6->daddr;
 172         struct dst_entry *dst = skb_dst(skb);
 173         struct ipv6hdr *hdr;
 174         u8  proto = fl6->flowi6_proto;
 175         int seg_len = skb->len;
 176         int hlimit = -1;
 177         u32 mtu;
 178
 179         if (opt) {
 180                 unsigned int head_room;
 181
 182                 /* First: exthdrs may take lots of space (~8K for now)
 183                    MAX_HEADER is not enough.
 184                  */
 185                 head_room = opt->opt_nflen + opt->opt_flen;
 186                 seg_len += head_room;
 187                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 188
 189                 if (skb_headroom(skb) < head_room) {
 190                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 191                         if (skb2 == NULL) {
 192                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 193                                               IPSTATS_MIB_OUTDISCARDS);
 194                                 kfree_skb(skb);
 195                                 return -ENOBUFS;
 196                         }
 197                         consume_skb(skb);
 198                         skb = skb2;
 199                         skb_set_owner_w(skb, sk);
 200                 }
 201                 if (opt->opt_flen)
 202                         ipv6_push_frag_opts(skb, opt, &proto);
 203                 if (opt->opt_nflen)
 204                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 205         }
 206
 207         skb_push(skb, sizeof(struct ipv6hdr));
 208         skb_reset_network_header(skb);
 209         hdr = ipv6_hdr(skb);
 210
 211         /*
 212          *      Fill in the IPv6 header
 213          */
 214         if (np)
 215                 hlimit = np->hop_limit;
 216         if (hlimit < 0)
 217                 hlimit = ip6_dst_hoplimit(dst);
 218
 219         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
 220
 221         hdr->payload_len = htons(seg_len);
 222         hdr->nexthdr = proto;
 223         hdr->hop_limit = hlimit;
 224
 225         hdr->saddr = fl6->saddr;
 226         hdr->daddr = *first_hop;
 227
 228         skb->priority = sk->sk_priority;
 229         skb->mark = sk->sk_mark;
 230
 231         mtu = dst_mtu(dst);
 232         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
 233                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 234                               IPSTATS_MIB_OUT, skb->len);
 235                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
 236                                dst->dev, dst_output);
 237         }
 238
 239         net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
 240         skb->dev = dst->dev;
 241         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 242         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 243         kfree_skb(skb);
 244         return -EMSGSIZE;
 245 }
 246
 247 EXPORT_SYMBOL(ip6_xmit);
 248
 249 /*
 250  *      To avoid extra problems ND packets are send through this
 251  *      routine. It's code duplication but I really want to avoid
 252  *      extra checks since ipv6_build_header is used by TCP (which
 253  *      is for us performance critical)
 254  */
 255
 256 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 257                const struct in6_addr *saddr, const struct in6_addr *daddr,
 258                int proto, int len)
 259 {
 260         struct ipv6_pinfo *np = inet6_sk(sk);
 261         struct ipv6hdr *hdr;
 262
 263         skb->protocol = htons(ETH_P_IPV6);
 264         skb->dev = dev;
 265
 266         skb_reset_network_header(skb);
 267         skb_put(skb, sizeof(struct ipv6hdr));
 268         hdr = ipv6_hdr(skb);
 269
 270         *(__be32*)hdr = htonl(0x60000000);
 271
 272         hdr->payload_len = htons(len);
 273         hdr->nexthdr = proto;
 274         hdr->hop_limit = np->hop_limit;
 275
 276         hdr->saddr = *saddr;
 277         hdr->daddr = *daddr;
 278
 279         return 0;
 280 }
 281
 282 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 283 {
 284         struct ip6_ra_chain *ra;
 285         struct sock *last = NULL;
 286
 287         read_lock(&ip6_ra_lock);
 288         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 289                 struct sock *sk = ra->sk;
 290                 if (sk && ra->sel == sel &&
 291                     (!sk->sk_bound_dev_if ||
 292                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 293                         if (last) {
 294                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 295                                 if (skb2)
 296                                         rawv6_rcv(last, skb2);
 297                         }
 298                         last = sk;
 299                 }
 300         }
 301
 302         if (last) {
 303                 rawv6_rcv(last, skb);
 304                 read_unlock(&ip6_ra_lock);
 305                 return 1;
 306         }
 307         read_unlock(&ip6_ra_lock);
 308         return 0;
 309 }
 310
 311 static int ip6_forward_proxy_check(struct sk_buff *skb)
 312 {
 313         struct ipv6hdr *hdr = ipv6_hdr(skb);
 314         u8 nexthdr = hdr->nexthdr;
 315         __be16 frag_off;
 316         int offset;
 317
 318         if (ipv6_ext_hdr(nexthdr)) {
 319                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 320                 if (offset < 0)
 321                         return 0;
 322         } else
 323                 offset = sizeof(struct ipv6hdr);
 324
 325         if (nexthdr == IPPROTO_ICMPV6) {
 326                 struct icmp6hdr *icmp6;
 327
 328                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 329                                          offset + 1 - skb->data)))
 330                         return 0;
 331
 332                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 333
 334                 switch (icmp6->icmp6_type) {
 335                 case NDISC_ROUTER_SOLICITATION:
 336                 case NDISC_ROUTER_ADVERTISEMENT:
 337                 case NDISC_NEIGHBOUR_SOLICITATION:
 338                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 339                 case NDISC_REDIRECT:
 340                         /* For reaction involving unicast neighbor discovery
 341                          * message destined to the proxied address, pass it to
 342                          * input function.
 343                          */
 344                         return 1;
 345                 default:
 346                         break;
 347                 }
 348         }
 349
 350         /*
 351          * The proxying router can't forward traffic sent to a link-local
 352          * address, so signal the sender and discard the packet. This
 353          * behavior is clarified by the MIPv6 specification.
 354          */
 355         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 356                 dst_link_failure(skb);
 357                 return -1;
 358         }
 359
 360         return 0;
 361 }
 362
 363 static inline int ip6_forward_finish(struct sk_buff *skb)
 364 {
 365         return dst_output(skb);
 366 }
 367
 368 int ip6_forward(struct sk_buff *skb)
 369 {
 370         struct dst_entry *dst = skb_dst(skb);
 371         struct ipv6hdr *hdr = ipv6_hdr(skb);
 372         struct inet6_skb_parm *opt = IP6CB(skb);
 373         struct net *net = dev_net(dst->dev);
 374         u32 mtu;
 375
 376         if (net->ipv6.devconf_all->forwarding == 0)
 377                 goto error;
 378
 379         if (skb_warn_if_lro(skb))
 380                 goto drop;
 381
 382         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 383                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 384                 goto drop;
 385         }
 386
 387         if (skb->pkt_type != PACKET_HOST)
 388                 goto drop;
 389
 390         skb_forward_csum(skb);
 391
 392         /*
 393          *      We DO NOT make any processing on
 394          *      RA packets, pushing them to user level AS IS
 395          *      without ane WARRANTY that application will be able
 396          *      to interpret them. The reason is that we
 397          *      cannot make anything clever here.
 398          *
 399          *      We are not end-node, so that if packet contains
 400          *      AH/ESP, we cannot make anything.
 401          *      Defragmentation also would be mistake, RA packets
 402          *      cannot be fragmented, because there is no warranty
 403          *      that different fragments will go along one path. --ANK
 404          */
 405         if (opt->ra) {
 406                 u8 *ptr = skb_network_header(skb) + opt->ra;
 407                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 408                         return 0;
 409         }
 410
 411         /*
 412          *      check and decrement ttl
 413          */
 414         if (hdr->hop_limit <= 1) {
 415                 /* Force OUTPUT device used as source address */
 416                 skb->dev = dst->dev;
 417                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 418                 IP6_INC_STATS_BH(net,
 419                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 420
 421                 kfree_skb(skb);
 422                 return -ETIMEDOUT;
 423         }
 424
 425         /* XXX: idev->cnf.proxy_ndp? */
 426         if (net->ipv6.devconf_all->proxy_ndp &&
 427             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 428                 int proxied = ip6_forward_proxy_check(skb);
 429                 if (proxied > 0)
 430                         return ip6_input(skb);
 431                 else if (proxied < 0) {
 432                         IP6_INC_STATS(net, ip6_dst_idev(dst),
 433                                       IPSTATS_MIB_INDISCARDS);
 434                         goto drop;
 435                 }
 436         }
 437
 438         if (!xfrm6_route_forward(skb)) {
 439                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 440                 goto drop;
 441         }
 442         dst = skb_dst(skb);
 443
 444         /* IPv6 specs say nothing about it, but it is clear that we cannot
 445            send redirects to source routed frames.
 446            We don't send redirects to frames decapsulated from IPsec.
 447          */
 448         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
 449                 struct in6_addr *target = NULL;
 450                 struct inet_peer *peer;
 451                 struct rt6_info *rt;
 452
 453                 /*
 454                  *      incoming and outgoing devices are the same
 455                  *      send a redirect.
 456                  */
 457
 458                 rt = (struct rt6_info *) dst;
 459                 if (rt->rt6i_flags & RTF_GATEWAY)
 460                         target = &rt->rt6i_gateway;
 461                 else
 462                         target = &hdr->daddr;
 463
 464                 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
 465
 466                 /* Limit redirects both by destination (here)
 467                    and by source (inside ndisc_send_redirect)
 468                  */
 469                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 470                         ndisc_send_redirect(skb, target);
 471                 if (peer)
 472                         inet_putpeer(peer);
 473         } else {
 474                 int addrtype = ipv6_addr_type(&hdr->saddr);
 475
 476                 /* This check is security critical. */
 477                 if (addrtype == IPV6_ADDR_ANY ||
 478                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 479                         goto error;
 480                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 481                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 482                                     ICMPV6_NOT_NEIGHBOUR, 0);
 483                         goto error;
 484                 }
 485         }
 486
 487         mtu = dst_mtu(dst);
 488         if (mtu < IPV6_MIN_MTU)
 489                 mtu = IPV6_MIN_MTU;
 490
 491         if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
 492             (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
 493                 /* Again, force OUTPUT device used as source address */
 494                 skb->dev = dst->dev;
 495                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 496                 IP6_INC_STATS_BH(net,
 497                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 498                 IP6_INC_STATS_BH(net,
 499                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 500                 kfree_skb(skb);
 501                 return -EMSGSIZE;
 502         }
 503
 504         if (skb_cow(skb, dst->dev->hard_header_len)) {
 505                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 506                 goto drop;
 507         }
 508
 509         hdr = ipv6_hdr(skb);
 510
 511         /* Mangling hops number delayed to point after skb COW */
 512
 513         hdr->hop_limit--;
 514
 515         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 516         IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 517         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 518                        ip6_forward_finish);
 519
 520 error:
 521         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 522 drop:
 523         kfree_skb(skb);
 524         return -EINVAL;
 525 }
 526
 527 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 528 {
 529         to->pkt_type = from->pkt_type;
 530         to->priority = from->priority;
 531         to->protocol = from->protocol;
 532         skb_dst_drop(to);
 533         skb_dst_set(to, dst_clone(skb_dst(from)));
 534         to->dev = from->dev;
 535         to->mark = from->mark;
 536
 537 #ifdef CONFIG_NET_SCHED
 538         to->tc_index = from->tc_index;
 539 #endif
 540         nf_copy(to, from);
 541 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
 542         to->nf_trace = from->nf_trace;
 543 #endif
 544         skb_copy_secmark(to, from);
 545 }
 546
 547 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 548 {
 549         struct sk_buff *frag;
 550         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
 551         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 552         struct ipv6hdr *tmp_hdr;
 553         struct frag_hdr *fh;
 554         unsigned int mtu, hlen, left, len;
 555         int hroom, troom;
 556         __be32 frag_id = 0;
 557         int ptr, offset = 0, err=0;
 558         u8 *prevhdr, nexthdr = 0;
 559         struct net *net = dev_net(skb_dst(skb)->dev);
 560
 561         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 562         nexthdr = *prevhdr;
 563
 564         mtu = ip6_skb_dst_mtu(skb);
 565
 566         /* We must not fragment if the socket is set to force MTU discovery
 567          * or if the skb it not generated by a local socket.
 568          */
 569         if (unlikely(!skb->local_df && skb->len > mtu) ||
 570                      (IP6CB(skb)->frag_max_size &&
 571                       IP6CB(skb)->frag_max_size > mtu)) {
 572                 if (skb->sk && dst_allfrag(skb_dst(skb)))
 573                         sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 574
 575                 skb->dev = skb_dst(skb)->dev;
 576                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 577                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 578                               IPSTATS_MIB_FRAGFAILS);
 579                 kfree_skb(skb);
 580                 return -EMSGSIZE;
 581         }
 582
 583         if (np && np->frag_size < mtu) {
 584                 if (np->frag_size)
 585                         mtu = np->frag_size;
 586         }
 587         mtu -= hlen + sizeof(struct frag_hdr);
 588
 589         if (skb_has_frag_list(skb)) {
 590                 int first_len = skb_pagelen(skb);
 591                 struct sk_buff *frag2;
 592
 593                 if (first_len - hlen > mtu ||
 594                     ((first_len - hlen) & 7) ||
 595                     skb_cloned(skb))
 596                         goto slow_path;
 597
 598                 skb_walk_frags(skb, frag) {
 599                         /* Correct geometry. */
 600                         if (frag->len > mtu ||
 601                             ((frag->len & 7) && frag->next) ||
 602                             skb_headroom(frag) < hlen)
 603                                 goto slow_path_clean;
 604
 605                         /* Partially cloned skb? */
 606                         if (skb_shared(frag))
 607                                 goto slow_path_clean;
 608
 609                         BUG_ON(frag->sk);
 610                         if (skb->sk) {
 611                                 frag->sk = skb->sk;
 612                                 frag->destructor = sock_wfree;
 613                         }
 614                         skb->truesize -= frag->truesize;
 615                 }
 616
 617                 err = 0;
 618                 offset = 0;
 619                 frag = skb_shinfo(skb)->frag_list;
 620                 skb_frag_list_init(skb);
 621                 /* BUILD HEADER */
 622
 623                 *prevhdr = NEXTHDR_FRAGMENT;
 624                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 625                 if (!tmp_hdr) {
 626                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 627                                       IPSTATS_MIB_FRAGFAILS);
 628                         return -ENOMEM;
 629                 }
 630
 631                 __skb_pull(skb, hlen);
 632                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 633                 __skb_push(skb, hlen);
 634                 skb_reset_network_header(skb);
 635                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 636
 637                 ipv6_select_ident(fh, rt);
 638                 fh->nexthdr = nexthdr;
 639                 fh->reserved = 0;
 640                 fh->frag_off = htons(IP6_MF);
 641                 frag_id = fh->identification;
 642
 643                 first_len = skb_pagelen(skb);
 644                 skb->data_len = first_len - skb_headlen(skb);
 645                 skb->len = first_len;
 646                 ipv6_hdr(skb)->payload_len = htons(first_len -
 647                                                    sizeof(struct ipv6hdr));
 648
 649                 dst_hold(&rt->dst);
 650
 651                 for (;;) {
 652                         /* Prepare header of the next frame,
 653                          * before previous one went down. */
 654                         if (frag) {
 655                                 frag->ip_summed = CHECKSUM_NONE;
 656                                 skb_reset_transport_header(frag);
 657                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 658                                 __skb_push(frag, hlen);
 659                                 skb_reset_network_header(frag);
 660                                 memcpy(skb_network_header(frag), tmp_hdr,
 661                                        hlen);
 662                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 663                                 fh->nexthdr = nexthdr;
 664                                 fh->reserved = 0;
 665                                 fh->frag_off = htons(offset);
 666                                 if (frag->next != NULL)
 667                                         fh->frag_off |= htons(IP6_MF);
 668                                 fh->identification = frag_id;
 669                                 ipv6_hdr(frag)->payload_len =
 670                                                 htons(frag->len -
 671                                                       sizeof(struct ipv6hdr));
 672                                 ip6_copy_metadata(frag, skb);
 673                         }
 674
 675                         err = output(skb);
 676                         if(!err)
 677                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 678                                               IPSTATS_MIB_FRAGCREATES);
 679
 680                         if (err || !frag)
 681                                 break;
 682
 683                         skb = frag;
 684                         frag = skb->next;
 685                         skb->next = NULL;
 686                 }
 687
 688                 kfree(tmp_hdr);
 689
 690                 if (err == 0) {
 691                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 692                                       IPSTATS_MIB_FRAGOKS);
 693                         ip6_rt_put(rt);
 694                         return 0;
 695                 }
 696
 697                 while (frag) {
 698                         skb = frag->next;
 699                         kfree_skb(frag);
 700                         frag = skb;
 701                 }
 702
 703                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 704                               IPSTATS_MIB_FRAGFAILS);
 705                 ip6_rt_put(rt);
 706                 return err;
 707
 708 slow_path_clean:
 709                 skb_walk_frags(skb, frag2) {
 710                         if (frag2 == frag)
 711                                 break;
 712                         frag2->sk = NULL;
 713                         frag2->destructor = NULL;
 714                         skb->truesize += frag2->truesize;
 715                 }
 716         }
 717
 718 slow_path:
 719         if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
 720             skb_checksum_help(skb))
 721                 goto fail;
 722
 723         left = skb->len - hlen;         /* Space per frame */
 724         ptr = hlen;                     /* Where to start from */
 725
 726         /*
 727          *      Fragment the datagram.
 728          */
 729
 730         *prevhdr = NEXTHDR_FRAGMENT;
 731         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 732         troom = rt->dst.dev->needed_tailroom;
 733
 734         /*
 735          *      Keep copying data until we run out.
 736          */
 737         while(left > 0) {
 738                 len = left;
 739                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 740                 if (len > mtu)
 741                         len = mtu;
 742                 /* IF: we are not sending up to and including the packet end
 743                    then align the next start on an eight byte boundary */
 744                 if (len < left) {
 745                         len &= ~7;
 746                 }
 747                 /*
 748                  *      Allocate buffer.
 749                  */
 750
 751                 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 752                                       hroom + troom, GFP_ATOMIC)) == NULL) {
 753                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 754                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 755                                       IPSTATS_MIB_FRAGFAILS);
 756                         err = -ENOMEM;
 757                         goto fail;
 758                 }
 759
 760                 /*
 761                  *      Set up data on packet
 762                  */
 763
 764                 ip6_copy_metadata(frag, skb);
 765                 skb_reserve(frag, hroom);
 766                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 767                 skb_reset_network_header(frag);
 768                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 769                 frag->transport_header = (frag->network_header + hlen +
 770                                           sizeof(struct frag_hdr));
 771
 772                 /*
 773                  *      Charge the memory for the fragment to any owner
 774                  *      it might possess
 775                  */
 776                 if (skb->sk)
 777                         skb_set_owner_w(frag, skb->sk);
 778
 779                 /*
 780                  *      Copy the packet header into the new buffer.
 781                  */
 782                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 783
 784                 /*
 785                  *      Build fragment header.
 786                  */
 787                 fh->nexthdr = nexthdr;
 788                 fh->reserved = 0;
 789                 if (!frag_id) {
 790                         ipv6_select_ident(fh, rt);
 791                         frag_id = fh->identification;
 792                 } else
 793                         fh->identification = frag_id;
 794
 795                 /*
 796                  *      Copy a block of the IP datagram.
 797                  */
 798                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 799                         BUG();
 800                 left -= len;
 801
 802                 fh->frag_off = htons(offset);
 803                 if (left > 0)
 804                         fh->frag_off |= htons(IP6_MF);
 805                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 806                                                     sizeof(struct ipv6hdr));
 807
 808                 ptr += len;
 809                 offset += len;
 810
 811                 /*
 812                  *      Put this fragment into the sending queue.
 813                  */
 814                 err = output(frag);
 815                 if (err)
 816                         goto fail;
 817
 818                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 819                               IPSTATS_MIB_FRAGCREATES);
 820         }
 821         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 822                       IPSTATS_MIB_FRAGOKS);
 823         consume_skb(skb);
 824         return err;
 825
 826 fail:
 827         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 828                       IPSTATS_MIB_FRAGFAILS);
 829         kfree_skb(skb);
 830         return err;
 831 }
 832
 833 static inline int ip6_rt_check(const struct rt6key *rt_key,
 834                                const struct in6_addr *fl_addr,
 835                                const struct in6_addr *addr_cache)
 836 {
 837         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 838                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
 839 }
 840
 841 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 842                                           struct dst_entry *dst,
 843                                           const struct flowi6 *fl6)
 844 {
 845         struct ipv6_pinfo *np = inet6_sk(sk);
 846         struct rt6_info *rt = (struct rt6_info *)dst;
 847
 848         if (!dst)
 849                 goto out;
 850
 851         /* Yes, checking route validity in not connected
 852          * case is not very simple. Take into account,
 853          * that we do not support routing by source, TOS,
 854          * and MSG_DONTROUTE            --ANK (980726)
 855          *
 856          * 1. ip6_rt_check(): If route was host route,
 857          *    check that cached destination is current.
 858          *    If it is network route, we still may
 859          *    check its validity using saved pointer
 860          *    to the last used address: daddr_cache.
 861          *    We do not want to save whole address now,
 862          *    (because main consumer of this service
 863          *    is tcp, which has not this problem),
 864          *    so that the last trick works only on connected
 865          *    sockets.
 866          * 2. oif also should be the same.
 867          */
 868         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 869 #ifdef CONFIG_IPV6_SUBTREES
 870             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 871 #endif
 872             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
 873                 dst_release(dst);
 874                 dst = NULL;
 875         }
 876
 877 out:
 878         return dst;
 879 }
 880
 881 static int ip6_dst_lookup_tail(struct sock *sk,
 882                                struct dst_entry **dst, struct flowi6 *fl6)
 883 {
 884         struct net *net = sock_net(sk);
 885 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 886         struct neighbour *n;
 887         struct rt6_info *rt;
 888 #endif
 889         int err;
 890
 891         if (*dst == NULL)
 892                 *dst = ip6_route_output(net, sk, fl6);
 893
 894         if ((err = (*dst)->error))
 895                 goto out_err_release;
 896
 897         if (ipv6_addr_any(&fl6->saddr)) {
 898                 struct rt6_info *rt = (struct rt6_info *) *dst;
 899                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 900                                           sk ? inet6_sk(sk)->srcprefs : 0,
 901                                           &fl6->saddr);
 902                 if (err)
 903                         goto out_err_release;
 904         }
 905
 906 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 907         /*
 908          * Here if the dst entry we've looked up
 909          * has a neighbour entry that is in the INCOMPLETE
 910          * state and the src address from the flow is
 911          * marked as OPTIMISTIC, we release the found
 912          * dst entry and replace it instead with the
 913          * dst entry of the nexthop router
 914          */
 915         rt = (struct rt6_info *) *dst;
 916         n = rt->n;
 917         if (n && !(n->nud_state & NUD_VALID)) {
 918                 struct inet6_ifaddr *ifp;
 919                 struct flowi6 fl_gw6;
 920                 int redirect;
 921
 922                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
 923                                       (*dst)->dev, 1);
 924
 925                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 926                 if (ifp)
 927                         in6_ifa_put(ifp);
 928
 929                 if (redirect) {
 930                         /*
 931                          * We need to get the dst entry for the
 932                          * default router instead
 933                          */
 934                         dst_release(*dst);
 935                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
 936                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
 937                         *dst = ip6_route_output(net, sk, &fl_gw6);
 938                         if ((err = (*dst)->error))
 939                                 goto out_err_release;
 940                 }
 941         }
 942 #endif
 943
 944         return 0;
 945
 946 out_err_release:
 947         if (err == -ENETUNREACH)
 948                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
 949         dst_release(*dst);
 950         *dst = NULL;
 951         return err;
 952 }
 953
 954 /**
 955  *      ip6_dst_lookup - perform route lookup on flow
 956  *      @sk: socket which provides route info
 957  *      @dst: pointer to dst_entry * for result
 958  *      @fl6: flow to lookup
 959  *
 960  *      This function performs a route lookup on the given flow.
 961  *
 962  *      It returns zero on success, or a standard errno code on error.
 963  */
 964 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
 965 {
 966         *dst = NULL;
 967         return ip6_dst_lookup_tail(sk, dst, fl6);
 968 }
 969 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
 970
 971 /**
 972  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 973  *      @sk: socket which provides route info
 974  *      @fl6: flow to lookup
 975  *      @final_dst: final destination address for ipsec lookup
 976  *      @can_sleep: we are in a sleepable context
 977  *
 978  *      This function performs a route lookup on the given flow.
 979  *
 980  *      It returns a valid dst pointer on success, or a pointer encoded
 981  *      error code.
 982  */
 983 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
 984                                       const struct in6_addr *final_dst,
 985                                       bool can_sleep)
 986 {
 987         struct dst_entry *dst = NULL;
 988         int err;
 989
 990         err = ip6_dst_lookup_tail(sk, &dst, fl6);
 991         if (err)
 992                 return ERR_PTR(err);
 993         if (final_dst)
 994                 fl6->daddr = *final_dst;
 995         if (can_sleep)
 996                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
 997
 998         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
 999 }
1000 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1001
1002 /**
1003  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1004  *      @sk: socket which provides the dst cache and route info
1005  *      @fl6: flow to lookup
1006  *      @final_dst: final destination address for ipsec lookup
1007  *      @can_sleep: we are in a sleepable context
1008  *
1009  *      This function performs a route lookup on the given flow with the
1010  *      possibility of using the cached route in the socket if it is valid.
1011  *      It will take the socket dst lock when operating on the dst cache.
1012  *      As a result, this function can only be used in process context.
1013  *
1014  *      It returns a valid dst pointer on success, or a pointer encoded
1015  *      error code.
1016  */
1017 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1018                                          const struct in6_addr *final_dst,
1019                                          bool can_sleep)
1020 {
1021         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1022         int err;
1023
1024         dst = ip6_sk_dst_check(sk, dst, fl6);
1025
1026         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1027         if (err)
1028                 return ERR_PTR(err);
1029         if (final_dst)
1030                 fl6->daddr = *final_dst;
1031         if (can_sleep)
1032                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1033
1034         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1035 }
1036 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1037
1038 static inline int ip6_ufo_append_data(struct sock *sk,
1039                         int getfrag(void *from, char *to, int offset, int len,
1040                         int odd, struct sk_buff *skb),
1041                         void *from, int length, int hh_len, int fragheaderlen,
1042                         int transhdrlen, int mtu,unsigned int flags,
1043                         struct rt6_info *rt)
1044
1045 {
1046         struct sk_buff *skb;
1047         int err;
1048
1049         /* There is support for UDP large send offload by network
1050          * device, so create one single skb packet containing complete
1051          * udp datagram
1052          */
1053         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1054                 skb = sock_alloc_send_skb(sk,
1055                         hh_len + fragheaderlen + transhdrlen + 20,
1056                         (flags & MSG_DONTWAIT), &err);
1057                 if (skb == NULL)
1058                         return err;
1059
1060                 /* reserve space for Hardware header */
1061                 skb_reserve(skb, hh_len);
1062
1063                 /* create space for UDP/IP header */
1064                 skb_put(skb,fragheaderlen + transhdrlen);
1065
1066                 /* initialize network header pointer */
1067                 skb_reset_network_header(skb);
1068
1069                 /* initialize protocol header pointer */
1070                 skb->transport_header = skb->network_header + fragheaderlen;
1071
1072                 skb->ip_summed = CHECKSUM_PARTIAL;
1073                 skb->csum = 0;
1074         }
1075
1076         err = skb_append_datato_frags(sk,skb, getfrag, from,
1077                                       (length - transhdrlen));
1078         if (!err) {
1079                 struct frag_hdr fhdr;
1080
1081                 /* Specify the length of each IPv6 datagram fragment.
1082                  * It has to be a multiple of 8.
1083                  */
1084                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1085                                              sizeof(struct frag_hdr)) & ~7;
1086                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1087                 ipv6_select_ident(&fhdr, rt);
1088                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1089                 __skb_queue_tail(&sk->sk_write_queue, skb);
1090
1091                 return 0;
1092         }
1093         /* There is not enough support do UPD LSO,
1094          * so follow normal path
1095          */
1096         kfree_skb(skb);
1097
1098         return err;
1099 }
1100
1101 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1102                                                gfp_t gfp)
1103 {
1104         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1105 }
1106
1107 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1108                                                 gfp_t gfp)
1109 {
1110         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1111 }
1112
1113 static void ip6_append_data_mtu(int *mtu,
1114                                 int *maxfraglen,
1115                                 unsigned int fragheaderlen,
1116                                 struct sk_buff *skb,
1117                                 struct rt6_info *rt)
1118 {
1119         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1120                 if (skb == NULL) {
1121                         /* first fragment, reserve header_len */
1122                         *mtu = *mtu - rt->dst.header_len;
1123
1124                 } else {
1125                         /*
1126                          * this fragment is not first, the headers
1127                          * space is regarded as data space.
1128                          */
1129                         *mtu = dst_mtu(rt->dst.path);
1130                 }
1131                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1132                               + fragheaderlen - sizeof(struct frag_hdr);
1133         }
1134 }
1135
1136 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1137         int offset, int len, int odd, struct sk_buff *skb),
1138         void *from, int length, int transhdrlen,
1139         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1140         struct rt6_info *rt, unsigned int flags, int dontfrag)
1141 {
1142         struct inet_sock *inet = inet_sk(sk);
1143         struct ipv6_pinfo *np = inet6_sk(sk);
1144         struct inet_cork *cork;
1145         struct sk_buff *skb, *skb_prev = NULL;
1146         unsigned int maxfraglen, fragheaderlen;
1147         int exthdrlen;
1148         int dst_exthdrlen;
1149         int hh_len;
1150         int mtu;
1151         int copy;
1152         int err;
1153         int offset = 0;
1154         __u8 tx_flags = 0;
1155
1156         if (flags&MSG_PROBE)
1157                 return 0;
1158         cork = &inet->cork.base;
1159         if (skb_queue_empty(&sk->sk_write_queue)) {
1160                 /*
1161                  * setup for corking
1162                  */
1163                 if (opt) {
1164                         if (WARN_ON(np->cork.opt))
1165                                 return -EINVAL;
1166
1167                         np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1168                         if (unlikely(np->cork.opt == NULL))
1169                                 return -ENOBUFS;
1170
1171                         np->cork.opt->tot_len = opt->tot_len;
1172                         np->cork.opt->opt_flen = opt->opt_flen;
1173                         np->cork.opt->opt_nflen = opt->opt_nflen;
1174
1175                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1176                                                             sk->sk_allocation);
1177                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1178                                 return -ENOBUFS;
1179
1180                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1181                                                             sk->sk_allocation);
1182                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1183                                 return -ENOBUFS;
1184
1185                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1186                                                            sk->sk_allocation);
1187                         if (opt->hopopt && !np->cork.opt->hopopt)
1188                                 return -ENOBUFS;
1189
1190                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1191                                                             sk->sk_allocation);
1192                         if (opt->srcrt && !np->cork.opt->srcrt)
1193                                 return -ENOBUFS;
1194
1195                         /* need source address above miyazawa*/
1196                 }
1197                 dst_hold(&rt->dst);
1198                 cork->dst = &rt->dst;
1199                 inet->cork.fl.u.ip6 = *fl6;
1200                 np->cork.hop_limit = hlimit;
1201                 np->cork.tclass = tclass;
1202                 if (rt->dst.flags & DST_XFRM_TUNNEL)
1203                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1204                               rt->dst.dev->mtu : dst_mtu(&rt->dst);
1205                 else
1206                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1207                               rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1208                 if (np->frag_size < mtu) {
1209                         if (np->frag_size)
1210                                 mtu = np->frag_size;
1211                 }
1212                 cork->fragsize = mtu;
1213                 if (dst_allfrag(rt->dst.path))
1214                         cork->flags |= IPCORK_ALLFRAG;
1215                 cork->length = 0;
1216                 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1217                 length += exthdrlen;
1218                 transhdrlen += exthdrlen;
1219                 dst_exthdrlen = rt->dst.header_len;
1220         } else {
1221                 rt = (struct rt6_info *)cork->dst;
1222                 fl6 = &inet->cork.fl.u.ip6;
1223                 opt = np->cork.opt;
1224                 transhdrlen = 0;
1225                 exthdrlen = 0;
1226                 dst_exthdrlen = 0;
1227                 mtu = cork->fragsize;
1228         }
1229
1230         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1231
1232         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1233                         (opt ? opt->opt_nflen : 0);
1234         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1235
1236         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1237                 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1238                         ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1239                         return -EMSGSIZE;
1240                 }
1241         }
1242
1243         /* For UDP, check if TX timestamp is enabled */
1244         if (sk->sk_type == SOCK_DGRAM) {
1245                 err = sock_tx_timestamp(sk, &tx_flags);
1246                 if (err)
1247                         goto error;
1248         }
1249
1250         /*
1251          * Let's try using as much space as possible.
1252          * Use MTU if total length of the message fits into the MTU.
1253          * Otherwise, we need to reserve fragment header and
1254          * fragment alignment (= 8-15 octects, in total).
1255          *
1256          * Note that we may need to "move" the data from the tail of
1257          * of the buffer to the new fragment when we split
1258          * the message.
1259          *
1260          * FIXME: It may be fragmented into multiple chunks
1261          *        at once if non-fragmentable extension headers
1262          *        are too large.
1263          * --yoshfuji
1264          */
1265
1266         cork->length += length;
1267         if (length > mtu) {
1268                 int proto = sk->sk_protocol;
1269                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1270                         ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1271                         return -EMSGSIZE;
1272                 }
1273
1274                 if (proto == IPPROTO_UDP &&
1275                     (rt->dst.dev->features & NETIF_F_UFO)) {
1276
1277                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1278                                                   hh_len, fragheaderlen,
1279                                                   transhdrlen, mtu, flags, rt);
1280                         if (err)
1281                                 goto error;
1282                         return 0;
1283                 }
1284         }
1285
1286         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1287                 goto alloc_new_skb;
1288
1289         while (length > 0) {
1290                 /* Check if the remaining data fits into current packet. */
1291                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1292                 if (copy < length)
1293                         copy = maxfraglen - skb->len;
1294
1295                 if (copy <= 0) {
1296                         char *data;
1297                         unsigned int datalen;
1298                         unsigned int fraglen;
1299                         unsigned int fraggap;
1300                         unsigned int alloclen;
1301 alloc_new_skb:
1302                         /* There's no room in the current skb */
1303                         if (skb)
1304                                 fraggap = skb->len - maxfraglen;
1305                         else
1306                                 fraggap = 0;
1307                         /* update mtu and maxfraglen if necessary */
1308                         if (skb == NULL || skb_prev == NULL)
1309                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1310                                                     fragheaderlen, skb, rt);
1311
1312                         skb_prev = skb;
1313
1314                         /*
1315                          * If remaining data exceeds the mtu,
1316                          * we know we need more fragment(s).
1317                          */
1318                         datalen = length + fraggap;
1319
1320                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1321                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1322                         if ((flags & MSG_MORE) &&
1323                             !(rt->dst.dev->features&NETIF_F_SG))
1324                                 alloclen = mtu;
1325                         else
1326                                 alloclen = datalen + fragheaderlen;
1327
1328                         alloclen += dst_exthdrlen;
1329
1330                         if (datalen != length + fraggap) {
1331                                 /*
1332                                  * this is not the last fragment, the trailer
1333                                  * space is regarded as data space.
1334                                  */
1335                                 datalen += rt->dst.trailer_len;
1336                         }
1337
1338                         alloclen += rt->dst.trailer_len;
1339                         fraglen = datalen + fragheaderlen;
1340
1341                         /*
1342                          * We just reserve space for fragment header.
1343                          * Note: this may be overallocation if the message
1344                          * (without MSG_MORE) fits into the MTU.
1345                          */
1346                         alloclen += sizeof(struct frag_hdr);
1347
1348                         if (transhdrlen) {
1349                                 skb = sock_alloc_send_skb(sk,
1350                                                 alloclen + hh_len,
1351                                                 (flags & MSG_DONTWAIT), &err);
1352                         } else {
1353                                 skb = NULL;
1354                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1355                                     2 * sk->sk_sndbuf)
1356                                         skb = sock_wmalloc(sk,
1357                                                            alloclen + hh_len, 1,
1358                                                            sk->sk_allocation);
1359                                 if (unlikely(skb == NULL))
1360                                         err = -ENOBUFS;
1361                                 else {
1362                                         /* Only the initial fragment
1363                                          * is time stamped.
1364                                          */
1365                                         tx_flags = 0;
1366                                 }
1367                         }
1368                         if (skb == NULL)
1369                                 goto error;
1370                         /*
1371                          *      Fill in the control structures
1372                          */
1373                         skb->ip_summed = CHECKSUM_NONE;
1374                         skb->csum = 0;
1375                         /* reserve for fragmentation and ipsec header */
1376                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1377                                     dst_exthdrlen);
1378
1379                         if (sk->sk_type == SOCK_DGRAM)
1380                                 skb_shinfo(skb)->tx_flags = tx_flags;
1381
1382                         /*
1383                          *      Find where to start putting bytes
1384                          */
1385                         data = skb_put(skb, fraglen);
1386                         skb_set_network_header(skb, exthdrlen);
1387                         data += fragheaderlen;
1388                         skb->transport_header = (skb->network_header +
1389                                                  fragheaderlen);
1390                         if (fraggap) {
1391                                 skb->csum = skb_copy_and_csum_bits(
1392                                         skb_prev, maxfraglen,
1393                                         data + transhdrlen, fraggap, 0);
1394                                 skb_prev->csum = csum_sub(skb_prev->csum,
1395                                                           skb->csum);
1396                                 data += fraggap;
1397                                 pskb_trim_unique(skb_prev, maxfraglen);
1398                         }
1399                         copy = datalen - transhdrlen - fraggap;
1400
1401                         if (copy < 0) {
1402                                 err = -EINVAL;
1403                                 kfree_skb(skb);
1404                                 goto error;
1405                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1406                                 err = -EFAULT;
1407                                 kfree_skb(skb);
1408                                 goto error;
1409                         }
1410
1411                         offset += copy;
1412                         length -= datalen - fraggap;
1413                         transhdrlen = 0;
1414                         exthdrlen = 0;
1415                         dst_exthdrlen = 0;
1416
1417                         /*
1418                          * Put the packet on the pending queue
1419                          */
1420                         __skb_queue_tail(&sk->sk_write_queue, skb);
1421                         continue;
1422                 }
1423
1424                 if (copy > length)
1425                         copy = length;
1426
1427                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1428                         unsigned int off;
1429
1430                         off = skb->len;
1431                         if (getfrag(from, skb_put(skb, copy),
1432                                                 offset, copy, off, skb) < 0) {
1433                                 __skb_trim(skb, off);
1434                                 err = -EFAULT;
1435                                 goto error;
1436                         }
1437                 } else {
1438                         int i = skb_shinfo(skb)->nr_frags;
1439                         struct page_frag *pfrag = sk_page_frag(sk);
1440
1441                         err = -ENOMEM;
1442                         if (!sk_page_frag_refill(sk, pfrag))
1443                                 goto error;
1444
1445                         if (!skb_can_coalesce(skb, i, pfrag->page,
1446                                               pfrag->offset)) {
1447                                 err = -EMSGSIZE;
1448                                 if (i == MAX_SKB_FRAGS)
1449                                         goto error;
1450
1451                                 __skb_fill_page_desc(skb, i, pfrag->page,
1452                                                      pfrag->offset, 0);
1453                                 skb_shinfo(skb)->nr_frags = ++i;
1454                                 get_page(pfrag->page);
1455                         }
1456                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1457                         if (getfrag(from,
1458                                     page_address(pfrag->page) + pfrag->offset,
1459                                     offset, copy, skb->len, skb) < 0)
1460                                 goto error_efault;
1461
1462                         pfrag->offset += copy;
1463                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1464                         skb->len += copy;
1465                         skb->data_len += copy;
1466                         skb->truesize += copy;
1467                         atomic_add(copy, &sk->sk_wmem_alloc);
1468                 }
1469                 offset += copy;
1470                 length -= copy;
1471         }
1472
1473         return 0;
1474
1475 error_efault:
1476         err = -EFAULT;
1477 error:
1478         cork->length -= length;
1479         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1480         return err;
1481 }
1482 EXPORT_SYMBOL_GPL(ip6_append_data);
1483
1484 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1485 {
1486         if (np->cork.opt) {
1487                 kfree(np->cork.opt->dst0opt);
1488                 kfree(np->cork.opt->dst1opt);
1489                 kfree(np->cork.opt->hopopt);
1490                 kfree(np->cork.opt->srcrt);
1491                 kfree(np->cork.opt);
1492                 np->cork.opt = NULL;
1493         }
1494
1495         if (inet->cork.base.dst) {
1496                 dst_release(inet->cork.base.dst);
1497                 inet->cork.base.dst = NULL;
1498                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1499         }
1500         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1501 }
1502
1503 int ip6_push_pending_frames(struct sock *sk)
1504 {
1505         struct sk_buff *skb, *tmp_skb;
1506         struct sk_buff **tail_skb;
1507         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1508         struct inet_sock *inet = inet_sk(sk);
1509         struct ipv6_pinfo *np = inet6_sk(sk);
1510         struct net *net = sock_net(sk);
1511         struct ipv6hdr *hdr;
1512         struct ipv6_txoptions *opt = np->cork.opt;
1513         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1514         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1515         unsigned char proto = fl6->flowi6_proto;
1516         int err = 0;
1517
1518         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1519                 goto out;
1520         tail_skb = &(skb_shinfo(skb)->frag_list);
1521
1522         /* move skb->data to ip header from ext header */
1523         if (skb->data < skb_network_header(skb))
1524                 __skb_pull(skb, skb_network_offset(skb));
1525         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1526                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1527                 *tail_skb = tmp_skb;
1528                 tail_skb = &(tmp_skb->next);
1529                 skb->len += tmp_skb->len;
1530                 skb->data_len += tmp_skb->len;
1531                 skb->truesize += tmp_skb->truesize;
1532                 tmp_skb->destructor = NULL;
1533                 tmp_skb->sk = NULL;
1534         }
1535
1536         /* Allow local fragmentation. */
1537         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1538                 skb->local_df = 1;
1539
1540         *final_dst = fl6->daddr;
1541         __skb_pull(skb, skb_network_header_len(skb));
1542         if (opt && opt->opt_flen)
1543                 ipv6_push_frag_opts(skb, opt, &proto);
1544         if (opt && opt->opt_nflen)
1545                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1546
1547         skb_push(skb, sizeof(struct ipv6hdr));
1548         skb_reset_network_header(skb);
1549         hdr = ipv6_hdr(skb);
1550
1551         *(__be32*)hdr = fl6->flowlabel |
1552                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1553
1554         hdr->hop_limit = np->cork.hop_limit;
1555         hdr->nexthdr = proto;
1556         hdr->saddr = fl6->saddr;
1557         hdr->daddr = *final_dst;
1558
1559         skb->priority = sk->sk_priority;
1560         skb->mark = sk->sk_mark;
1561
1562         skb_dst_set(skb, dst_clone(&rt->dst));
1563         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1564         if (proto == IPPROTO_ICMPV6) {
1565                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1566
1567                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1568                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1569         }
1570
1571         err = ip6_local_out(skb);
1572         if (err) {
1573                 if (err > 0)
1574                         err = net_xmit_errno(err);
1575                 if (err)
1576                         goto error;
1577         }
1578
1579 out:
1580         ip6_cork_release(inet, np);
1581         return err;
1582 error:
1583         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1584         goto out;
1585 }
1586 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1587
1588 void ip6_flush_pending_frames(struct sock *sk)
1589 {
1590         struct sk_buff *skb;
1591
1592         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1593                 if (skb_dst(skb))
1594                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1595                                       IPSTATS_MIB_OUTDISCARDS);
1596                 kfree_skb(skb);
1597         }
1598
1599         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1600 }
1601 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);