net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
  16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/netfilter.h>
  43 #include <linux/netfilter_ipv6.h>
  44
  45 #include <net/sock.h>
  46 #include <net/snmp.h>
  47
  48 #include <net/ipv6.h>
  49 #include <net/ndisc.h>
  50 #include <net/protocol.h>
  51 #include <net/ip6_route.h>
  52 #include <net/addrconf.h>
  53 #include <net/rawv6.h>
  54 #include <net/icmp.h>
  55 #include <net/xfrm.h>
  56 #include <net/checksum.h>
  57 #include <linux/mroute6.h>
  58
  59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  60
  61 int __ip6_local_out(struct sk_buff *skb)
  62 {
  63         int len;
  64
  65         len = skb->len - sizeof(struct ipv6hdr);
  66         if (len > IPV6_MAXPLEN)
  67                 len = 0;
  68         ipv6_hdr(skb)->payload_len = htons(len);
  69
  70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
  71                        skb_dst(skb)->dev, dst_output);
  72 }
  73
  /*
   *      Send a locally generated packet: run __ip6_local_out() and, when
   *      it returns 1, continue with dst_output().  Any other value from
   *      __ip6_local_out() is returned to the caller unchanged.
   */
  int ip6_local_out(struct sk_buff *skb)
  {
          int rc = __ip6_local_out(skb);

          if (unlikely(rc != 1))
                  return rc;

          return dst_output(skb);
  }
  EXPORT_SYMBOL_GPL(ip6_local_out);
  85
  86 static int ip6_finish_output2(struct sk_buff *skb)
  87 {
  88         struct dst_entry *dst = skb_dst(skb);
  89         struct net_device *dev = dst->dev;
  90         struct neighbour *neigh;
  91         struct rt6_info *rt;
  92
  93         skb->protocol = htons(ETH_P_IPV6);
  94         skb->dev = dev;
  95
  96         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  97                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  98
  99                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
 100                     ((mroute6_socket(dev_net(dev), skb) &&
 101                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 102                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 103                                          &ipv6_hdr(skb)->saddr))) {
 104                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 105
 106                         /* Do not check for IFF_ALLMULTI; multicast routing
 107                            is not supported in any case.
 108                          */
 109                         if (newskb)
 110                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 111                                         newskb, NULL, newskb->dev,
 112                                         dev_loopback_xmit);
 113
 114                         if (ipv6_hdr(skb)->hop_limit == 0) {
 115                                 IP6_INC_STATS(dev_net(dev), idev,
 116                                               IPSTATS_MIB_OUTDISCARDS);
 117                                 kfree_skb(skb);
 118                                 return 0;
 119                         }
 120                 }
 121
 122                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
 123                                 skb->len);
 124         }
 125
 126         rcu_read_lock();
 127         rt = (struct rt6_info *) dst;
 128         neigh = rt->n;
 129         if (neigh) {
 130                 int res = dst_neigh_output(dst, neigh, skb);
 131
 132                 rcu_read_unlock();
 133                 return res;
 134         }
 135         rcu_read_unlock();
 136         IP6_INC_STATS_BH(dev_net(dst->dev),
 137                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 138         kfree_skb(skb);
 139         return -EINVAL;
 140 }
 141
 142 static int ip6_finish_output(struct sk_buff *skb)
 143 {
 144         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 145             dst_allfrag(skb_dst(skb)))
 146                 return ip6_fragment(skb, ip6_finish_output2);
 147         else
 148                 return ip6_finish_output2(skb);
 149 }
 150
 151 int ip6_output(struct sk_buff *skb)
 152 {
 153         struct net_device *dev = skb_dst(skb)->dev;
 154         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 155         if (unlikely(idev->cnf.disable_ipv6)) {
 156                 IP6_INC_STATS(dev_net(dev), idev,
 157                               IPSTATS_MIB_OUTDISCARDS);
 158                 kfree_skb(skb);
 159                 return 0;
 160         }
 161
 162         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
 163                             ip6_finish_output,
 164                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 165 }
 166
 167 /*
 168  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
 169  */
 170
 171 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 172              struct ipv6_txoptions *opt, int tclass)
 173 {
 174         struct net *net = sock_net(sk);
 175         struct ipv6_pinfo *np = inet6_sk(sk);
 176         struct in6_addr *first_hop = &fl6->daddr;
 177         struct dst_entry *dst = skb_dst(skb);
 178         struct ipv6hdr *hdr;
 179         u8  proto = fl6->flowi6_proto;
 180         int seg_len = skb->len;
 181         int hlimit = -1;
 182         u32 mtu;
 183
 184         if (opt) {
 185                 unsigned int head_room;
 186
 187                 /* First: exthdrs may take lots of space (~8K for now)
 188                    MAX_HEADER is not enough.
 189                  */
 190                 head_room = opt->opt_nflen + opt->opt_flen;
 191                 seg_len += head_room;
 192                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 193
 194                 if (skb_headroom(skb) < head_room) {
 195                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 196                         if (skb2 == NULL) {
 197                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 198                                               IPSTATS_MIB_OUTDISCARDS);
 199                                 kfree_skb(skb);
 200                                 return -ENOBUFS;
 201                         }
 202                         consume_skb(skb);
 203                         skb = skb2;
 204                         skb_set_owner_w(skb, sk);
 205                 }
 206                 if (opt->opt_flen)
 207                         ipv6_push_frag_opts(skb, opt, &proto);
 208                 if (opt->opt_nflen)
 209                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 210         }
 211
 212         skb_push(skb, sizeof(struct ipv6hdr));
 213         skb_reset_network_header(skb);
 214         hdr = ipv6_hdr(skb);
 215
 216         /*
 217          *      Fill in the IPv6 header
 218          */
 219         if (np)
 220                 hlimit = np->hop_limit;
 221         if (hlimit < 0)
 222                 hlimit = ip6_dst_hoplimit(dst);
 223
 224         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
 225
 226         hdr->payload_len = htons(seg_len);
 227         hdr->nexthdr = proto;
 228         hdr->hop_limit = hlimit;
 229
 230         hdr->saddr = fl6->saddr;
 231         hdr->daddr = *first_hop;
 232
 233         skb->priority = sk->sk_priority;
 234         skb->mark = sk->sk_mark;
 235
 236         mtu = dst_mtu(dst);
 237         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
 238                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 239                               IPSTATS_MIB_OUT, skb->len);
 240                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
 241                                dst->dev, dst_output);
 242         }
 243
 244         net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
 245         skb->dev = dst->dev;
 246         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 247         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 248         kfree_skb(skb);
 249         return -EMSGSIZE;
 250 }
 251
 252 EXPORT_SYMBOL(ip6_xmit);
 253
 254 /*
 255  *      To avoid extra problems ND packets are send through this
 256  *      routine. It's code duplication but I really want to avoid
 257  *      extra checks since ipv6_build_header is used by TCP (which
 258  *      is for us performance critical)
 259  */
 260
 261 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 262                const struct in6_addr *saddr, const struct in6_addr *daddr,
 263                int proto, int len)
 264 {
 265         struct ipv6_pinfo *np = inet6_sk(sk);
 266         struct ipv6hdr *hdr;
 267
 268         skb->protocol = htons(ETH_P_IPV6);
 269         skb->dev = dev;
 270
 271         skb_reset_network_header(skb);
 272         skb_put(skb, sizeof(struct ipv6hdr));
 273         hdr = ipv6_hdr(skb);
 274
 275         *(__be32*)hdr = htonl(0x60000000);
 276
 277         hdr->payload_len = htons(len);
 278         hdr->nexthdr = proto;
 279         hdr->hop_limit = np->hop_limit;
 280
 281         hdr->saddr = *saddr;
 282         hdr->daddr = *daddr;
 283
 284         return 0;
 285 }
 286
 287 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 288 {
 289         struct ip6_ra_chain *ra;
 290         struct sock *last = NULL;
 291
 292         read_lock(&ip6_ra_lock);
 293         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 294                 struct sock *sk = ra->sk;
 295                 if (sk && ra->sel == sel &&
 296                     (!sk->sk_bound_dev_if ||
 297                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 298                         if (last) {
 299                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 300                                 if (skb2)
 301                                         rawv6_rcv(last, skb2);
 302                         }
 303                         last = sk;
 304                 }
 305         }
 306
 307         if (last) {
 308                 rawv6_rcv(last, skb);
 309                 read_unlock(&ip6_ra_lock);
 310                 return 1;
 311         }
 312         read_unlock(&ip6_ra_lock);
 313         return 0;
 314 }
 315
 316 static int ip6_forward_proxy_check(struct sk_buff *skb)
 317 {
 318         struct ipv6hdr *hdr = ipv6_hdr(skb);
 319         u8 nexthdr = hdr->nexthdr;
 320         __be16 frag_off;
 321         int offset;
 322
 323         if (ipv6_ext_hdr(nexthdr)) {
 324                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 325                 if (offset < 0)
 326                         return 0;
 327         } else
 328                 offset = sizeof(struct ipv6hdr);
 329
 330         if (nexthdr == IPPROTO_ICMPV6) {
 331                 struct icmp6hdr *icmp6;
 332
 333                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 334                                          offset + 1 - skb->data)))
 335                         return 0;
 336
 337                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 338
 339                 switch (icmp6->icmp6_type) {
 340                 case NDISC_ROUTER_SOLICITATION:
 341                 case NDISC_ROUTER_ADVERTISEMENT:
 342                 case NDISC_NEIGHBOUR_SOLICITATION:
 343                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 344                 case NDISC_REDIRECT:
 345                         /* For reaction involving unicast neighbor discovery
 346                          * message destined to the proxied address, pass it to
 347                          * input function.
 348                          */
 349                         return 1;
 350                 default:
 351                         break;
 352                 }
 353         }
 354
 355         /*
 356          * The proxying router can't forward traffic sent to a link-local
 357          * address, so signal the sender and discard the packet. This
 358          * behavior is clarified by the MIPv6 specification.
 359          */
 360         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 361                 dst_link_failure(skb);
 362                 return -1;
 363         }
 364
 365         return 0;
 366 }
 367
 /* Continuation run after the NF_INET_FORWARD hook: output via the route. */
 static inline int ip6_forward_finish(struct sk_buff *skb)
 {
         int rc = dst_output(skb);

         return rc;
 }
 372
 373 int ip6_forward(struct sk_buff *skb)
 374 {
 375         struct dst_entry *dst = skb_dst(skb);
 376         struct ipv6hdr *hdr = ipv6_hdr(skb);
 377         struct inet6_skb_parm *opt = IP6CB(skb);
 378         struct net *net = dev_net(dst->dev);
 379         u32 mtu;
 380
 381         if (net->ipv6.devconf_all->forwarding == 0)
 382                 goto error;
 383
 384         if (skb_warn_if_lro(skb))
 385                 goto drop;
 386
 387         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 388                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 389                 goto drop;
 390         }
 391
 392         if (skb->pkt_type != PACKET_HOST)
 393                 goto drop;
 394
 395         skb_forward_csum(skb);
 396
 397         /*
 398          *      We DO NOT make any processing on
 399          *      RA packets, pushing them to user level AS IS
 400          *      without ane WARRANTY that application will be able
 401          *      to interpret them. The reason is that we
 402          *      cannot make anything clever here.
 403          *
 404          *      We are not end-node, so that if packet contains
 405          *      AH/ESP, we cannot make anything.
 406          *      Defragmentation also would be mistake, RA packets
 407          *      cannot be fragmented, because there is no warranty
 408          *      that different fragments will go along one path. --ANK
 409          */
 410         if (opt->ra) {
 411                 u8 *ptr = skb_network_header(skb) + opt->ra;
 412                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 413                         return 0;
 414         }
 415
 416         /*
 417          *      check and decrement ttl
 418          */
 419         if (hdr->hop_limit <= 1) {
 420                 /* Force OUTPUT device used as source address */
 421                 skb->dev = dst->dev;
 422                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 423                 IP6_INC_STATS_BH(net,
 424                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 425
 426                 kfree_skb(skb);
 427                 return -ETIMEDOUT;
 428         }
 429
 430         /* XXX: idev->cnf.proxy_ndp? */
 431         if (net->ipv6.devconf_all->proxy_ndp &&
 432             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 433                 int proxied = ip6_forward_proxy_check(skb);
 434                 if (proxied > 0)
 435                         return ip6_input(skb);
 436                 else if (proxied < 0) {
 437                         IP6_INC_STATS(net, ip6_dst_idev(dst),
 438                                       IPSTATS_MIB_INDISCARDS);
 439                         goto drop;
 440                 }
 441         }
 442
 443         if (!xfrm6_route_forward(skb)) {
 444                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 445                 goto drop;
 446         }
 447         dst = skb_dst(skb);
 448
 449         /* IPv6 specs say nothing about it, but it is clear that we cannot
 450            send redirects to source routed frames.
 451            We don't send redirects to frames decapsulated from IPsec.
 452          */
 453         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
 454                 struct in6_addr *target = NULL;
 455                 struct inet_peer *peer;
 456                 struct rt6_info *rt;
 457
 458                 /*
 459                  *      incoming and outgoing devices are the same
 460                  *      send a redirect.
 461                  */
 462
 463                 rt = (struct rt6_info *) dst;
 464                 if (rt->rt6i_flags & RTF_GATEWAY)
 465                         target = &rt->rt6i_gateway;
 466                 else
 467                         target = &hdr->daddr;
 468
 469                 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
 470
 471                 /* Limit redirects both by destination (here)
 472                    and by source (inside ndisc_send_redirect)
 473                  */
 474                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 475                         ndisc_send_redirect(skb, target);
 476                 if (peer)
 477                         inet_putpeer(peer);
 478         } else {
 479                 int addrtype = ipv6_addr_type(&hdr->saddr);
 480
 481                 /* This check is security critical. */
 482                 if (addrtype == IPV6_ADDR_ANY ||
 483                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 484                         goto error;
 485                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 486                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 487                                     ICMPV6_NOT_NEIGHBOUR, 0);
 488                         goto error;
 489                 }
 490         }
 491
 492         mtu = dst_mtu(dst);
 493         if (mtu < IPV6_MIN_MTU)
 494                 mtu = IPV6_MIN_MTU;
 495
 496         if (skb->len > mtu && !skb_is_gso(skb)) {
 497                 /* Again, force OUTPUT device used as source address */
 498                 skb->dev = dst->dev;
 499                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 500                 IP6_INC_STATS_BH(net,
 501                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 502                 IP6_INC_STATS_BH(net,
 503                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 504                 kfree_skb(skb);
 505                 return -EMSGSIZE;
 506         }
 507
 508         if (skb_cow(skb, dst->dev->hard_header_len)) {
 509                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 510                 goto drop;
 511         }
 512
 513         hdr = ipv6_hdr(skb);
 514
 515         /* Mangling hops number delayed to point after skb COW */
 516
 517         hdr->hop_limit--;
 518
 519         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 520         IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 521         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 522                        ip6_forward_finish);
 523
 524 error:
 525         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 526 drop:
 527         kfree_skb(skb);
 528         return -EINVAL;
 529 }
 530
 531 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 532 {
 533         to->pkt_type = from->pkt_type;
 534         to->priority = from->priority;
 535         to->protocol = from->protocol;
 536         skb_dst_drop(to);
 537         skb_dst_set(to, dst_clone(skb_dst(from)));
 538         to->dev = from->dev;
 539         to->mark = from->mark;
 540
 541 #ifdef CONFIG_NET_SCHED
 542         to->tc_index = from->tc_index;
 543 #endif
 544         nf_copy(to, from);
 545 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 546     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 547         to->nf_trace = from->nf_trace;
 548 #endif
 549         skb_copy_secmark(to, from);
 550 }
 551
 552 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 553 {
 554         u16 offset = sizeof(struct ipv6hdr);
 555         struct ipv6_opt_hdr *exthdr =
 556                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
 557         unsigned int packet_len = skb->tail - skb->network_header;
 558         int found_rhdr = 0;
 559         *nexthdr = &ipv6_hdr(skb)->nexthdr;
 560
 561         while (offset + 1 <= packet_len) {
 562
 563                 switch (**nexthdr) {
 564
 565                 case NEXTHDR_HOP:
 566                         break;
 567                 case NEXTHDR_ROUTING:
 568                         found_rhdr = 1;
 569                         break;
 570                 case NEXTHDR_DEST:
 571 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 572                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
 573                                 break;
 574 #endif
 575                         if (found_rhdr)
 576                                 return offset;
 577                         break;
 578                 default :
 579                         return offset;
 580                 }
 581
 582                 offset += ipv6_optlen(exthdr);
 583                 *nexthdr = &exthdr->nexthdr;
 584                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
 585                                                  offset);
 586         }
 587
 588         return offset;
 589 }
 590
 591 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
 592 {
 593         static atomic_t ipv6_fragmentation_id;
 594         int old, new;
 595
 596         if (rt && !(rt->dst.flags & DST_NOPEER)) {
 597                 struct inet_peer *peer;
 598                 struct net *net;
 599
 600                 net = dev_net(rt->dst.dev);
 601                 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
 602                 if (peer) {
 603                         fhdr->identification = htonl(inet_getid(peer, 0));
 604                         inet_putpeer(peer);
 605                         return;
 606                 }
 607         }
 608         do {
 609                 old = atomic_read(&ipv6_fragmentation_id);
 610                 new = old + 1;
 611                 if (!new)
 612                         new = 1;
 613         } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
 614         fhdr->identification = htonl(new);
 615 }
 616
 617 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 618 {
 619         struct sk_buff *frag;
 620         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
 621         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 622         struct ipv6hdr *tmp_hdr;
 623         struct frag_hdr *fh;
 624         unsigned int mtu, hlen, left, len;
 625         int hroom, troom;
 626         __be32 frag_id = 0;
 627         int ptr, offset = 0, err=0;
 628         u8 *prevhdr, nexthdr = 0;
 629         struct net *net = dev_net(skb_dst(skb)->dev);
 630
 631         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 632         nexthdr = *prevhdr;
 633
 634         mtu = ip6_skb_dst_mtu(skb);
 635
 636         /* We must not fragment if the socket is set to force MTU discovery
 637          * or if the skb it not generated by a local socket.
 638          */
 639         if (unlikely(!skb->local_df && skb->len > mtu)) {
 640                 if (skb->sk && dst_allfrag(skb_dst(skb)))
 641                         sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 642
 643                 skb->dev = skb_dst(skb)->dev;
 644                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 645                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 646                               IPSTATS_MIB_FRAGFAILS);
 647                 kfree_skb(skb);
 648                 return -EMSGSIZE;
 649         }
 650
 651         if (np && np->frag_size < mtu) {
 652                 if (np->frag_size)
 653                         mtu = np->frag_size;
 654         }
 655         mtu -= hlen + sizeof(struct frag_hdr);
 656
 657         if (skb_has_frag_list(skb)) {
 658                 int first_len = skb_pagelen(skb);
 659                 struct sk_buff *frag2;
 660
 661                 if (first_len - hlen > mtu ||
 662                     ((first_len - hlen) & 7) ||
 663                     skb_cloned(skb))
 664                         goto slow_path;
 665
 666                 skb_walk_frags(skb, frag) {
 667                         /* Correct geometry. */
 668                         if (frag->len > mtu ||
 669                             ((frag->len & 7) && frag->next) ||
 670                             skb_headroom(frag) < hlen)
 671                                 goto slow_path_clean;
 672
 673                         /* Partially cloned skb? */
 674                         if (skb_shared(frag))
 675                                 goto slow_path_clean;
 676
 677                         BUG_ON(frag->sk);
 678                         if (skb->sk) {
 679                                 frag->sk = skb->sk;
 680                                 frag->destructor = sock_wfree;
 681                         }
 682                         skb->truesize -= frag->truesize;
 683                 }
 684
 685                 err = 0;
 686                 offset = 0;
 687                 frag = skb_shinfo(skb)->frag_list;
 688                 skb_frag_list_init(skb);
 689                 /* BUILD HEADER */
 690
 691                 *prevhdr = NEXTHDR_FRAGMENT;
 692                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 693                 if (!tmp_hdr) {
 694                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 695                                       IPSTATS_MIB_FRAGFAILS);
 696                         return -ENOMEM;
 697                 }
 698
 699                 __skb_pull(skb, hlen);
 700                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 701                 __skb_push(skb, hlen);
 702                 skb_reset_network_header(skb);
 703                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 704
 705                 ipv6_select_ident(fh, rt);
 706                 fh->nexthdr = nexthdr;
 707                 fh->reserved = 0;
 708                 fh->frag_off = htons(IP6_MF);
 709                 frag_id = fh->identification;
 710
 711                 first_len = skb_pagelen(skb);
 712                 skb->data_len = first_len - skb_headlen(skb);
 713                 skb->len = first_len;
 714                 ipv6_hdr(skb)->payload_len = htons(first_len -
 715                                                    sizeof(struct ipv6hdr));
 716
 717                 dst_hold(&rt->dst);
 718
 719                 for (;;) {
 720                         /* Prepare header of the next frame,
 721                          * before previous one went down. */
 722                         if (frag) {
 723                                 frag->ip_summed = CHECKSUM_NONE;
 724                                 skb_reset_transport_header(frag);
 725                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 726                                 __skb_push(frag, hlen);
 727                                 skb_reset_network_header(frag);
 728                                 memcpy(skb_network_header(frag), tmp_hdr,
 729                                        hlen);
 730                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 731                                 fh->nexthdr = nexthdr;
 732                                 fh->reserved = 0;
 733                                 fh->frag_off = htons(offset);
 734                                 if (frag->next != NULL)
 735                                         fh->frag_off |= htons(IP6_MF);
 736                                 fh->identification = frag_id;
 737                                 ipv6_hdr(frag)->payload_len =
 738                                                 htons(frag->len -
 739                                                       sizeof(struct ipv6hdr));
 740                                 ip6_copy_metadata(frag, skb);
 741                         }
 742
 743                         err = output(skb);
 744                         if(!err)
 745                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 746                                               IPSTATS_MIB_FRAGCREATES);
 747
 748                         if (err || !frag)
 749                                 break;
 750
 751                         skb = frag;
 752                         frag = skb->next;
 753                         skb->next = NULL;
 754                 }
 755
 756                 kfree(tmp_hdr);
 757
 758                 if (err == 0) {
 759                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 760                                       IPSTATS_MIB_FRAGOKS);
 761                         dst_release(&rt->dst);
 762                         return 0;
 763                 }
 764
 765                 while (frag) {
 766                         skb = frag->next;
 767                         kfree_skb(frag);
 768                         frag = skb;
 769                 }
 770
 771                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 772                               IPSTATS_MIB_FRAGFAILS);
 773                 dst_release(&rt->dst);
 774                 return err;
 775
 776 slow_path_clean:
 777                 skb_walk_frags(skb, frag2) {
 778                         if (frag2 == frag)
 779                                 break;
 780                         frag2->sk = NULL;
 781                         frag2->destructor = NULL;
 782                         skb->truesize += frag2->truesize;
 783                 }
 784         }
 785
 786 slow_path:
 787         if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
 788             skb_checksum_help(skb))
 789                 goto fail;
 790
 791         left = skb->len - hlen;         /* Space per frame */
 792         ptr = hlen;                     /* Where to start from */
 793
 794         /*
 795          *      Fragment the datagram.
 796          */
 797
 798         *prevhdr = NEXTHDR_FRAGMENT;
 799         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 800         troom = rt->dst.dev->needed_tailroom;
 801
 802         /*
 803          *      Keep copying data until we run out.
 804          */
 805         while(left > 0) {
 806                 len = left;
 807                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 808                 if (len > mtu)
 809                         len = mtu;
 810                 /* IF: we are not sending up to and including the packet end
 811                    then align the next start on an eight byte boundary */
 812                 if (len < left) {
 813                         len &= ~7;
 814                 }
 815                 /*
 816                  *      Allocate buffer.
 817                  */
 818
 819                 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 820                                       hroom + troom, GFP_ATOMIC)) == NULL) {
 821                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 822                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 823                                       IPSTATS_MIB_FRAGFAILS);
 824                         err = -ENOMEM;
 825                         goto fail;
 826                 }
 827
 828                 /*
 829                  *      Set up data on packet
 830                  */
 831
 832                 ip6_copy_metadata(frag, skb);
 833                 skb_reserve(frag, hroom);
 834                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 835                 skb_reset_network_header(frag);
 836                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 837                 frag->transport_header = (frag->network_header + hlen +
 838                                           sizeof(struct frag_hdr));
 839
 840                 /*
 841                  *      Charge the memory for the fragment to any owner
 842                  *      it might possess
 843                  */
 844                 if (skb->sk)
 845                         skb_set_owner_w(frag, skb->sk);
 846
 847                 /*
 848                  *      Copy the packet header into the new buffer.
 849                  */
 850                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 851
 852                 /*
 853                  *      Build fragment header.
 854                  */
 855                 fh->nexthdr = nexthdr;
 856                 fh->reserved = 0;
 857                 if (!frag_id) {
 858                         ipv6_select_ident(fh, rt);
 859                         frag_id = fh->identification;
 860                 } else
 861                         fh->identification = frag_id;
 862
 863                 /*
 864                  *      Copy a block of the IP datagram.
 865                  */
 866                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 867                         BUG();
 868                 left -= len;
 869
 870                 fh->frag_off = htons(offset);
 871                 if (left > 0)
 872                         fh->frag_off |= htons(IP6_MF);
 873                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 874                                                     sizeof(struct ipv6hdr));
 875
 876                 ptr += len;
 877                 offset += len;
 878
 879                 /*
 880                  *      Put this fragment into the sending queue.
 881                  */
 882                 err = output(frag);
 883                 if (err)
 884                         goto fail;
 885
 886                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 887                               IPSTATS_MIB_FRAGCREATES);
 888         }
 889         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 890                       IPSTATS_MIB_FRAGOKS);
 891         consume_skb(skb);
 892         return err;
 893
 894 fail:
 895         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 896                       IPSTATS_MIB_FRAGFAILS);
 897         kfree_skb(skb);
 898         return err;
 899 }
 900
 901 static inline int ip6_rt_check(const struct rt6key *rt_key,
 902                                const struct in6_addr *fl_addr,
 903                                const struct in6_addr *addr_cache)
 904 {
 905         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 906                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
 907 }
 908
     /*
      * ip6_sk_dst_check - decide whether a candidate route still fits a flow
      * @sk:  socket whose ipv6_pinfo supplies daddr_cache (and, with
      *       CONFIG_IPV6_SUBTREES, saddr_cache)
      * @dst: candidate route, may be NULL
      * @fl6: flow the caller is about to route
      *
      * Returns @dst untouched when it passes every test below.  Returns
      * NULL when @dst is NULL on entry or when a test fails; in the latter
      * case dst_release() has already been called on @dst.
      */
 909 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 910                                           struct dst_entry *dst,
 911                                           const struct flowi6 *fl6)
 912 {
 913         struct ipv6_pinfo *np = inet6_sk(sk);
             /* Only a cast here; rt is not dereferenced before the NULL test. */
 914         struct rt6_info *rt = (struct rt6_info *)dst;
 915
             /* No candidate route to validate. */
 916         if (!dst)
 917                 goto out;
 918
 919         /* Yes, checking route validity in not connected
 920          * case is not very simple. Take into account,
 921          * that we do not support routing by source, TOS,
 922          * and MSG_DONTROUTE            --ANK (980726)
 923          *
 924          * 1. ip6_rt_check(): If route was host route,
 925          *    check that cached destination is current.
 926          *    If it is network route, we still may
 927          *    check its validity using saved pointer
 928          *    to the last used address: daddr_cache.
 929          *    We do not want to save whole address now,
 930          *    (because main consumer of this service
 931          *    is tcp, which has not this problem),
 932          *    so that the last trick works only on connected
 933          *    sockets.
 934          * 2. oif also should be the same.
 935          */
             /*
              * Any one failing test rejects the route: destination key,
              * source key (CONFIG_IPV6_SUBTREES only), or a non-zero
              * flowi6_oif that differs from the route's device ifindex.
              */
 936         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 937 #ifdef CONFIG_IPV6_SUBTREES
 938             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 939 #endif
 940             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
                     /* Rejected: give the route up and report "none". */
 941                 dst_release(dst);
 942                 dst = NULL;
 943         }
 944
 945 out:
 946         return dst;
 947 }
 948
     /*
      * ip6_dst_lookup_tail - shared back end of the dst lookup helpers
      * @sk:  socket the lookup is done for; passed to sock_net() to get
      *       the network namespace
      * @dst: in/out route pointer; a NULL *@dst requests a fresh lookup
      * @fl6: flow to route; fl6->saddr is filled in when it is unspecified
      *
      * Steps, in order:
      *  1. If *@dst is NULL, obtain a route with ip6_route_output().
      *  2. Fail if the route carries a non-zero ->error.
      *  3. If fl6->saddr is the unspecified address, choose one with
      *     ip6_route_get_saddr().
      *  4. (CONFIG_IPV6_OPTIMISTIC_DAD only) possibly replace the route
      *     with one looked up for a zeroed destination; see the comment
      *     in the body.
      *
      * Returns 0 with *@dst set on success.  On failure returns the error
      * after calling dst_release() on *@dst and resetting *@dst to NULL.
      *
      * NOTE(review): @sk is handed to sock_net() unconditionally but is
      * tested for NULL in step 3 - confirm whether a NULL @sk is allowed.
      */
 949 static int ip6_dst_lookup_tail(struct sock *sk,
 950                                struct dst_entry **dst, struct flowi6 *fl6)
 951 {
 952         struct net *net = sock_net(sk);
 953 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 954         struct neighbour *n;
 955         struct rt6_info *rt;
 956 #endif
 957         int err;
 958
             /* Caller supplied no route: look one up now. */
 959         if (*dst == NULL)
 960                 *dst = ip6_route_output(net, sk, fl6);
 961
             /* A non-zero ->error on the route is treated as failure. */
 962         if ((err = (*dst)->error))
 963                 goto out_err_release;
 964
 965         if (ipv6_addr_any(&fl6->saddr)) {
                     /*
                      * Block-local rt; with CONFIG_IPV6_OPTIMISTIC_DAD it
                      * shadows the function-scope variable of the same name.
                      */
 966                 struct rt6_info *rt = (struct rt6_info *) *dst;
 967                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 968                                           sk ? inet6_sk(sk)->srcprefs : 0,
 969                                           &fl6->saddr);
 970                 if (err)
 971                         goto out_err_release;
 972         }
 973
 974 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 975         /*
 976          * Here if the dst entry we've looked up
 977          * has a neighbour entry that is in the INCOMPLETE
 978          * state and the src address from the flow is
 979          * marked as OPTIMISTIC, we release the found
 980          * dst entry and replace it instead with the
 981          * dst entry of the nexthop router
 982          */
             /*
              * NOTE(review): the code tests "no NUD_VALID bit set"; confirm
              * that this matches the INCOMPLETE wording above.
              */
 983         rcu_read_lock();
 984         rt = (struct rt6_info *) *dst;
 985         n = rt->n;
 986         if (n && !(n->nud_state & NUD_VALID)) {
 987                 struct inet6_ifaddr *ifp;
 988                 struct flowi6 fl_gw6;
 989                 int redirect;
 990
                     /* n is not used past this point; the else branch unlocks too. */
 991                 rcu_read_unlock();
 992                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
 993                                       (*dst)->dev, 1);
 994
                     /* Record the flag first so ifp can be put right away. */
 995                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 996                 if (ifp)
 997                         in6_ifa_put(ifp);
 998
 999                 if (redirect) {
1000                         /*
1001                          * We need to get the dst entry for the
1002                          * default router instead
1003                          */
                             /* Look up a copy of the flow with daddr zeroed. */
1004                         dst_release(*dst);
1005                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1006                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1007                         *dst = ip6_route_output(net, sk, &fl_gw6);
1008                         if ((err = (*dst)->error))
1009                                 goto out_err_release;
1010                 }
1011         } else {
1012                 rcu_read_unlock();
1013         }
1014 #endif
1015
1016         return 0;
1017
1018 out_err_release:
             /* Only -ENETUNREACH is counted as OUTNOROUTES. */
1019         if (err == -ENETUNREACH)
1020                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
             /* Never hand a failed route back to the caller. */
1021         dst_release(*dst);
1022         *dst = NULL;
1023         return err;
1024 }
1025
1026 /**
1027  *      ip6_dst_lookup - perform route lookup on flow
1028  *      @sk: socket which provides route info
1029  *      @dst: pointer to dst_entry * for result
1030  *      @fl6: flow to lookup
1031  *
1032  *      This function performs a route lookup on the given flow.
1033  *
1034  *      It returns zero on success, or a standard errno code on error.
1035  */
1036 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1037 {
1038         *dst = NULL;
1039         return ip6_dst_lookup_tail(sk, dst, fl6);
1040 }
1041 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1042
1043 /**
1044  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1045  *      @sk: socket which provides route info
1046  *      @fl6: flow to lookup
1047  *      @final_dst: final destination address for ipsec lookup
1048  *      @can_sleep: we are in a sleepable context
1049  *
1050  *      This function performs a route lookup on the given flow.
1051  *
1052  *      It returns a valid dst pointer on success, or a pointer encoded
1053  *      error code.
1054  */
1055 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1056                                       const struct in6_addr *final_dst,
1057                                       bool can_sleep)
1058 {
1059         struct dst_entry *dst = NULL;
1060         int err;
1061
1062         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1063         if (err)
1064                 return ERR_PTR(err);
1065         if (final_dst)
1066                 fl6->daddr = *final_dst;
1067         if (can_sleep)
1068                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1069
1070         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1071 }
1072 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1073
1074 /**
1075  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1076  *      @sk: socket which provides the dst cache and route info
1077  *      @fl6: flow to lookup
1078  *      @final_dst: final destination address for ipsec lookup
1079  *      @can_sleep: we are in a sleepable context
1080  *
1081  *      This function performs a route lookup on the given flow with the
1082  *      possibility of using the cached route in the socket if it is valid.
1083  *      It will take the socket dst lock when operating on the dst cache.
1084  *      As a result, this function can only be used in process context.
1085  *
1086  *      It returns a valid dst pointer on success, or a pointer encoded
1087  *      error code.
1088  */
1089 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1090                                          const struct in6_addr *final_dst,
1091                                          bool can_sleep)
1092 {
1093         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1094         int err;
1095
1096         dst = ip6_sk_dst_check(sk, dst, fl6);
1097
1098         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1099         if (err)
1100                 return ERR_PTR(err);
1101         if (final_dst)
1102                 fl6->daddr = *final_dst;
1103         if (can_sleep)
1104                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1105
1106         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1107 }
1108 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1109
1110 static inline int ip6_ufo_append_data(struct sock *sk,
1111                         int getfrag(void *from, char *to, int offset, int len,
1112                         int odd, struct sk_buff *skb),
1113                         void *from, int length, int hh_len, int fragheaderlen,
1114                         int transhdrlen, int mtu,unsigned int flags,
1115                         struct rt6_info *rt)
1116
1117 {
1118         struct sk_buff *skb;
1119         int err;
1120
1121         /* There is support for UDP large send offload by network
1122          * device, so create one single skb packet containing complete
1123          * udp datagram
1124          */
1125         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1126                 skb = sock_alloc_send_skb(sk,
1127                         hh_len + fragheaderlen + transhdrlen + 20,
1128                         (flags & MSG_DONTWAIT), &err);
1129                 if (skb == NULL)
1130                         return err;
1131
1132                 /* reserve space for Hardware header */
1133                 skb_reserve(skb, hh_len);
1134
1135                 /* create space for UDP/IP header */
1136                 skb_put(skb,fragheaderlen + transhdrlen);
1137
1138                 /* initialize network header pointer */
1139                 skb_reset_network_header(skb);
1140
1141                 /* initialize protocol header pointer */
1142                 skb->transport_header = skb->network_header + fragheaderlen;
1143
1144                 skb->ip_summed = CHECKSUM_PARTIAL;
1145                 skb->csum = 0;
1146         }
1147
1148         err = skb_append_datato_frags(sk,skb, getfrag, from,
1149                                       (length - transhdrlen));
1150         if (!err) {
1151                 struct frag_hdr fhdr;
1152
1153                 /* Specify the length of each IPv6 datagram fragment.
1154                  * It has to be a multiple of 8.
1155                  */
1156                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1157                                              sizeof(struct frag_hdr)) & ~7;
1158                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1159                 ipv6_select_ident(&fhdr, rt);
1160                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1161                 __skb_queue_tail(&sk->sk_write_queue, skb);
1162
1163                 return 0;
1164         }
1165         /* There is not enough support do UPD LSO,
1166          * so follow normal path
1167          */
1168         kfree_skb(skb);
1169
1170         return err;
1171 }
1172
1173 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1174                                                gfp_t gfp)
1175 {
1176         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1177 }
1178
1179 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1180                                                 gfp_t gfp)
1181 {
1182         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1183 }
1184
1185 static void ip6_append_data_mtu(int *mtu,
1186                                 int *maxfraglen,
1187                                 unsigned int fragheaderlen,
1188                                 struct sk_buff *skb,
1189                                 struct rt6_info *rt)
1190 {
1191         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1192                 if (skb == NULL) {
1193                         /* first fragment, reserve header_len */
1194                         *mtu = *mtu - rt->dst.header_len;
1195
1196                 } else {
1197                         /*
1198                          * this fragment is not first, the headers
1199                          * space is regarded as data space.
1200                          */
1201                         *mtu = dst_mtu(rt->dst.path);
1202                 }
1203                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1204                               + fragheaderlen - sizeof(struct frag_hdr);
1205         }
1206 }
1207
1208 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1209         int offset, int len, int odd, struct sk_buff *skb),
1210         void *from, int length, int transhdrlen,
1211         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1212         struct rt6_info *rt, unsigned int flags, int dontfrag)
1213 {
1214         struct inet_sock *inet = inet_sk(sk);
1215         struct ipv6_pinfo *np = inet6_sk(sk);
1216         struct inet_cork *cork;
1217         struct sk_buff *skb, *skb_prev = NULL;
1218         unsigned int maxfraglen, fragheaderlen;
1219         int exthdrlen;
1220         int dst_exthdrlen;
1221         int hh_len;
1222         int mtu;
1223         int copy;
1224         int err;
1225         int offset = 0;
1226         __u8 tx_flags = 0;
1227
1228         if (flags&MSG_PROBE)
1229                 return 0;
1230         cork = &inet->cork.base;
1231         if (skb_queue_empty(&sk->sk_write_queue)) {
1232                 /*
1233                  * setup for corking
1234                  */
1235                 if (opt) {
1236                         if (WARN_ON(np->cork.opt))
1237                                 return -EINVAL;
1238
1239                         np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1240                         if (unlikely(np->cork.opt == NULL))
1241                                 return -ENOBUFS;
1242
1243                         np->cork.opt->tot_len = opt->tot_len;
1244                         np->cork.opt->opt_flen = opt->opt_flen;
1245                         np->cork.opt->opt_nflen = opt->opt_nflen;
1246
1247                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1248                                                             sk->sk_allocation);
1249                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1250                                 return -ENOBUFS;
1251
1252                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1253                                                             sk->sk_allocation);
1254                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1255                                 return -ENOBUFS;
1256
1257                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1258                                                            sk->sk_allocation);
1259                         if (opt->hopopt && !np->cork.opt->hopopt)
1260                                 return -ENOBUFS;
1261
1262                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1263                                                             sk->sk_allocation);
1264                         if (opt->srcrt && !np->cork.opt->srcrt)
1265                                 return -ENOBUFS;
1266
1267                         /* need source address above miyazawa*/
1268                 }
1269                 dst_hold(&rt->dst);
1270                 cork->dst = &rt->dst;
1271                 inet->cork.fl.u.ip6 = *fl6;
1272                 np->cork.hop_limit = hlimit;
1273                 np->cork.tclass = tclass;
1274                 if (rt->dst.flags & DST_XFRM_TUNNEL)
1275                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1276                               rt->dst.dev->mtu : dst_mtu(&rt->dst);
1277                 else
1278                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1279                               rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1280                 if (np->frag_size < mtu) {
1281                         if (np->frag_size)
1282                                 mtu = np->frag_size;
1283                 }
1284                 cork->fragsize = mtu;
1285                 if (dst_allfrag(rt->dst.path))
1286                         cork->flags |= IPCORK_ALLFRAG;
1287                 cork->length = 0;
1288                 sk->sk_sndmsg_page = NULL;
1289                 sk->sk_sndmsg_off = 0;
1290                 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1291                 length += exthdrlen;
1292                 transhdrlen += exthdrlen;
1293                 dst_exthdrlen = rt->dst.header_len;
1294         } else {
1295                 rt = (struct rt6_info *)cork->dst;
1296                 fl6 = &inet->cork.fl.u.ip6;
1297                 opt = np->cork.opt;
1298                 transhdrlen = 0;
1299                 exthdrlen = 0;
1300                 dst_exthdrlen = 0;
1301                 mtu = cork->fragsize;
1302         }
1303
1304         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1305
1306         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1307                         (opt ? opt->opt_nflen : 0);
1308         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1309
1310         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1311                 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1312                         ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1313                         return -EMSGSIZE;
1314                 }
1315         }
1316
1317         /* For UDP, check if TX timestamp is enabled */
1318         if (sk->sk_type == SOCK_DGRAM) {
1319                 err = sock_tx_timestamp(sk, &tx_flags);
1320                 if (err)
1321                         goto error;
1322         }
1323
1324         /*
1325          * Let's try using as much space as possible.
1326          * Use MTU if total length of the message fits into the MTU.
1327          * Otherwise, we need to reserve fragment header and
1328          * fragment alignment (= 8-15 octects, in total).
1329          *
1330          * Note that we may need to "move" the data from the tail of
1331          * of the buffer to the new fragment when we split
1332          * the message.
1333          *
1334          * FIXME: It may be fragmented into multiple chunks
1335          *        at once if non-fragmentable extension headers
1336          *        are too large.
1337          * --yoshfuji
1338          */
1339
1340         cork->length += length;
1341         if (length > mtu) {
1342                 int proto = sk->sk_protocol;
1343                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1344                         ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1345                         return -EMSGSIZE;
1346                 }
1347
1348                 if (proto == IPPROTO_UDP &&
1349                     (rt->dst.dev->features & NETIF_F_UFO)) {
1350
1351                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1352                                                   hh_len, fragheaderlen,
1353                                                   transhdrlen, mtu, flags, rt);
1354                         if (err)
1355                                 goto error;
1356                         return 0;
1357                 }
1358         }
1359
1360         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1361                 goto alloc_new_skb;
1362
1363         while (length > 0) {
1364                 /* Check if the remaining data fits into current packet. */
1365                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1366                 if (copy < length)
1367                         copy = maxfraglen - skb->len;
1368
1369                 if (copy <= 0) {
1370                         char *data;
1371                         unsigned int datalen;
1372                         unsigned int fraglen;
1373                         unsigned int fraggap;
1374                         unsigned int alloclen;
1375 alloc_new_skb:
1376                         /* There's no room in the current skb */
1377                         if (skb)
1378                                 fraggap = skb->len - maxfraglen;
1379                         else
1380                                 fraggap = 0;
1381                         /* update mtu and maxfraglen if necessary */
1382                         if (skb == NULL || skb_prev == NULL)
1383                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1384                                                     fragheaderlen, skb, rt);
1385
1386                         skb_prev = skb;
1387
1388                         /*
1389                          * If remaining data exceeds the mtu,
1390                          * we know we need more fragment(s).
1391                          */
1392                         datalen = length + fraggap;
1393
1394                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1395                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1396                         if ((flags & MSG_MORE) &&
1397                             !(rt->dst.dev->features&NETIF_F_SG))
1398                                 alloclen = mtu;
1399                         else
1400                                 alloclen = datalen + fragheaderlen;
1401
1402                         alloclen += dst_exthdrlen;
1403
1404                         if (datalen != length + fraggap) {
1405                                 /*
1406                                  * this is not the last fragment, the trailer
1407                                  * space is regarded as data space.
1408                                  */
1409                                 datalen += rt->dst.trailer_len;
1410                         }
1411
1412                         alloclen += rt->dst.trailer_len;
1413                         fraglen = datalen + fragheaderlen;
1414
1415                         /*
1416                          * We just reserve space for fragment header.
1417                          * Note: this may be overallocation if the message
1418                          * (without MSG_MORE) fits into the MTU.
1419                          */
1420                         alloclen += sizeof(struct frag_hdr);
1421
1422                         if (transhdrlen) {
1423                                 skb = sock_alloc_send_skb(sk,
1424                                                 alloclen + hh_len,
1425                                                 (flags & MSG_DONTWAIT), &err);
1426                         } else {
1427                                 skb = NULL;
1428                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1429                                     2 * sk->sk_sndbuf)
1430                                         skb = sock_wmalloc(sk,
1431                                                            alloclen + hh_len, 1,
1432                                                            sk->sk_allocation);
1433                                 if (unlikely(skb == NULL))
1434                                         err = -ENOBUFS;
1435                                 else {
1436                                         /* Only the initial fragment
1437                                          * is time stamped.
1438                                          */
1439                                         tx_flags = 0;
1440                                 }
1441                         }
1442                         if (skb == NULL)
1443                                 goto error;
1444                         /*
1445                          *      Fill in the control structures
1446                          */
1447                         skb->ip_summed = CHECKSUM_NONE;
1448                         skb->csum = 0;
1449                         /* reserve for fragmentation and ipsec header */
1450                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1451                                     dst_exthdrlen);
1452
1453                         if (sk->sk_type == SOCK_DGRAM)
1454                                 skb_shinfo(skb)->tx_flags = tx_flags;
1455
1456                         /*
1457                          *      Find where to start putting bytes
1458                          */
1459                         data = skb_put(skb, fraglen);
1460                         skb_set_network_header(skb, exthdrlen);
1461                         data += fragheaderlen;
1462                         skb->transport_header = (skb->network_header +
1463                                                  fragheaderlen);
1464                         if (fraggap) {
1465                                 skb->csum = skb_copy_and_csum_bits(
1466                                         skb_prev, maxfraglen,
1467                                         data + transhdrlen, fraggap, 0);
1468                                 skb_prev->csum = csum_sub(skb_prev->csum,
1469                                                           skb->csum);
1470                                 data += fraggap;
1471                                 pskb_trim_unique(skb_prev, maxfraglen);
1472                         }
1473                         copy = datalen - transhdrlen - fraggap;
1474
1475                         if (copy < 0) {
1476                                 err = -EINVAL;
1477                                 kfree_skb(skb);
1478                                 goto error;
1479                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1480                                 err = -EFAULT;
1481                                 kfree_skb(skb);
1482                                 goto error;
1483                         }
1484
1485                         offset += copy;
1486                         length -= datalen - fraggap;
1487                         transhdrlen = 0;
1488                         exthdrlen = 0;
1489                         dst_exthdrlen = 0;
1490
1491                         /*
1492                          * Put the packet on the pending queue
1493                          */
1494                         __skb_queue_tail(&sk->sk_write_queue, skb);
1495                         continue;
1496                 }
1497
1498                 if (copy > length)
1499                         copy = length;
1500
1501                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1502                         unsigned int off;
1503
1504                         off = skb->len;
1505                         if (getfrag(from, skb_put(skb, copy),
1506                                                 offset, copy, off, skb) < 0) {
1507                                 __skb_trim(skb, off);
1508                                 err = -EFAULT;
1509                                 goto error;
1510                         }
1511                 } else {
1512                         int i = skb_shinfo(skb)->nr_frags;
1513                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1514                         struct page *page = sk->sk_sndmsg_page;
1515                         int off = sk->sk_sndmsg_off;
1516                         unsigned int left;
1517
1518                         if (page && (left = PAGE_SIZE - off) > 0) {
1519                                 if (copy >= left)
1520                                         copy = left;
1521                                 if (page != skb_frag_page(frag)) {
1522                                         if (i == MAX_SKB_FRAGS) {
1523                                                 err = -EMSGSIZE;
1524                                                 goto error;
1525                                         }
1526                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1527                                         skb_frag_ref(skb, i);
1528                                         frag = &skb_shinfo(skb)->frags[i];
1529                                 }
1530                         } else if(i < MAX_SKB_FRAGS) {
1531                                 if (copy > PAGE_SIZE)
1532                                         copy = PAGE_SIZE;
1533                                 page = alloc_pages(sk->sk_allocation, 0);
1534                                 if (page == NULL) {
1535                                         err = -ENOMEM;
1536                                         goto error;
1537                                 }
1538                                 sk->sk_sndmsg_page = page;
1539                                 sk->sk_sndmsg_off = 0;
1540
1541                                 skb_fill_page_desc(skb, i, page, 0, 0);
1542                                 frag = &skb_shinfo(skb)->frags[i];
1543                         } else {
1544                                 err = -EMSGSIZE;
1545                                 goto error;
1546                         }
1547                         if (getfrag(from,
1548                                     skb_frag_address(frag) + skb_frag_size(frag),
1549                                     offset, copy, skb->len, skb) < 0) {
1550                                 err = -EFAULT;
1551                                 goto error;
1552                         }
1553                         sk->sk_sndmsg_off += copy;
1554                         skb_frag_size_add(frag, copy);
1555                         skb->len += copy;
1556                         skb->data_len += copy;
1557                         skb->truesize += copy;
1558                         atomic_add(copy, &sk->sk_wmem_alloc);
1559                 }
1560                 offset += copy;
1561                 length -= copy;
1562         }
1563         return 0;
1564 error:
1565         cork->length -= length;
1566         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1567         return err;
1568 }
1569 EXPORT_SYMBOL_GPL(ip6_append_data);
1570
1571 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1572 {
1573         if (np->cork.opt) {
1574                 kfree(np->cork.opt->dst0opt);
1575                 kfree(np->cork.opt->dst1opt);
1576                 kfree(np->cork.opt->hopopt);
1577                 kfree(np->cork.opt->srcrt);
1578                 kfree(np->cork.opt);
1579                 np->cork.opt = NULL;
1580         }
1581
1582         if (inet->cork.base.dst) {
1583                 dst_release(inet->cork.base.dst);
1584                 inet->cork.base.dst = NULL;
1585                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1586         }
1587         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1588 }
1589
1590 int ip6_push_pending_frames(struct sock *sk)
1591 {
1592         struct sk_buff *skb, *tmp_skb;
1593         struct sk_buff **tail_skb;
1594         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1595         struct inet_sock *inet = inet_sk(sk);
1596         struct ipv6_pinfo *np = inet6_sk(sk);
1597         struct net *net = sock_net(sk);
1598         struct ipv6hdr *hdr;
1599         struct ipv6_txoptions *opt = np->cork.opt;
1600         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1601         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1602         unsigned char proto = fl6->flowi6_proto;
1603         int err = 0;
1604
1605         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1606                 goto out;
1607         tail_skb = &(skb_shinfo(skb)->frag_list);
1608
1609         /* move skb->data to ip header from ext header */
1610         if (skb->data < skb_network_header(skb))
1611                 __skb_pull(skb, skb_network_offset(skb));
1612         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1613                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1614                 *tail_skb = tmp_skb;
1615                 tail_skb = &(tmp_skb->next);
1616                 skb->len += tmp_skb->len;
1617                 skb->data_len += tmp_skb->len;
1618                 skb->truesize += tmp_skb->truesize;
1619                 tmp_skb->destructor = NULL;
1620                 tmp_skb->sk = NULL;
1621         }
1622
1623         /* Allow local fragmentation. */
1624         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1625                 skb->local_df = 1;
1626
1627         *final_dst = fl6->daddr;
1628         __skb_pull(skb, skb_network_header_len(skb));
1629         if (opt && opt->opt_flen)
1630                 ipv6_push_frag_opts(skb, opt, &proto);
1631         if (opt && opt->opt_nflen)
1632                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1633
1634         skb_push(skb, sizeof(struct ipv6hdr));
1635         skb_reset_network_header(skb);
1636         hdr = ipv6_hdr(skb);
1637
1638         *(__be32*)hdr = fl6->flowlabel |
1639                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1640
1641         hdr->hop_limit = np->cork.hop_limit;
1642         hdr->nexthdr = proto;
1643         hdr->saddr = fl6->saddr;
1644         hdr->daddr = *final_dst;
1645
1646         skb->priority = sk->sk_priority;
1647         skb->mark = sk->sk_mark;
1648
1649         skb_dst_set(skb, dst_clone(&rt->dst));
1650         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1651         if (proto == IPPROTO_ICMPV6) {
1652                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1653
1654                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1655                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1656         }
1657
1658         err = ip6_local_out(skb);
1659         if (err) {
1660                 if (err > 0)
1661                         err = net_xmit_errno(err);
1662                 if (err)
1663                         goto error;
1664         }
1665
1666 out:
1667         ip6_cork_release(inet, np);
1668         return err;
1669 error:
1670         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1671         goto out;
1672 }
1673 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1674
1675 void ip6_flush_pending_frames(struct sock *sk)
1676 {
1677         struct sk_buff *skb;
1678
1679         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1680                 if (skb_dst(skb))
1681                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1682                                       IPSTATS_MIB_OUTDISCARDS);
1683                 kfree_skb(skb);
1684         }
1685
1686         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1687 }
1688 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);