Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
[platform/kernel/linux-starfive.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61
/* ip6_finish_output2() - final transmit step for a routed IPv6 packet:
 * loop back multicast copies where required, honour lightweight-tunnel
 * redirection, then resolve the route's nexthop neighbour and hand the
 * skb to neigh_output().  Returns 0, a negative errno, or the value
 * propagated from lwtunnel_xmit()/neigh_output().
 */
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64         struct dst_entry *dst = skb_dst(skb);
65         struct net_device *dev = dst->dev;
66         struct neighbour *neigh;
67         struct in6_addr *nexthop;
68         int ret;
69
70         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72
                   /* Loop a copy of the packet back to local listeners when
                    * multicast loopback applies: either a multicast-router
                    * socket originated it (and it was not already forwarded),
                    * or a local socket has joined the destination group.
                    */
73                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74                     ((mroute6_is_socket(net, skb) &&
75                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77                                          &ipv6_hdr(skb)->saddr))) {
78                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79
80                         /* Do not check for IFF_ALLMULTI; multicast routing
81                            is not supported in any case.
82                          */
83                         if (newskb)
84                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85                                         net, sk, newskb, NULL, newskb->dev,
86                                         dev_loopback_xmit);
87
                           /* Hop limit 0 on a looped multicast packet: the
                            * copy above is all that should be delivered; do
                            * not put the original on the wire.
                            */
88                         if (ipv6_hdr(skb)->hop_limit == 0) {
89                                 IP6_INC_STATS(net, idev,
90                                               IPSTATS_MIB_OUTDISCARDS);
91                                 kfree_skb(skb);
92                                 return 0;
93                         }
94                 }
95
96                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97
                   /* Node-local scope multicast must never leave the host. */
98                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99                     IPV6_ADDR_SCOPE_NODELOCAL &&
100                     !(dev->flags & IFF_LOOPBACK)) {
101                         kfree_skb(skb);
102                         return 0;
103                 }
104         }
105
           /* Lightweight tunnel state may take over transmission entirely. */
106         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107                 int res = lwtunnel_xmit(skb);
108
109                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110                         return res;
111         }
112
           /* Neighbour lookup is lockless; the entry is only guaranteed to
            * stay alive inside this rcu_read_lock_bh() section.
            */
113         rcu_read_lock_bh();
114         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116         if (unlikely(!neigh))
117                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118         if (!IS_ERR(neigh)) {
119                 sock_confirm_neigh(skb, neigh);
120                 ret = neigh_output(neigh, skb);
121                 rcu_read_unlock_bh();
122                 return ret;
123         }
124         rcu_read_unlock_bh();
125
126         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127         kfree_skb(skb);
128         return -EINVAL;
129 }
130
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133         int ret;
134
135         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136         if (ret) {
137                 kfree_skb(skb);
138                 return ret;
139         }
140
141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142         /* Policy lookup after SNAT yielded a new policy */
143         if (skb_dst(skb)->xfrm) {
144                 IPCB(skb)->flags |= IPSKB_REROUTED;
145                 return dst_output(net, sk, skb);
146         }
147 #endif
148
149         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150             dst_allfrag(skb_dst(skb)) ||
151             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
153         else
154                 return ip6_finish_output2(net, sk, skb);
155 }
156
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159         struct net_device *dev = skb_dst(skb)->dev;
160         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161
162         skb->protocol = htons(ETH_P_IPV6);
163         skb->dev = dev;
164
165         if (unlikely(idev->cnf.disable_ipv6)) {
166                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167                 kfree_skb(skb);
168                 return 0;
169         }
170
171         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172                             net, sk, skb, NULL, dev,
173                             ip6_finish_output,
174                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179         if (!np->autoflowlabel_set)
180                 return ip6_default_np_autolabel(net);
181         else
182                 return np->autoflowlabel;
183 }
184
185 /*
186  * xmit an sk_buff (used by TCP, SCTP and DCCP)
187  * Note : socket lock is not held for SYNACK packets, but might be modified
188  * by calls to skb_set_owner_w() and ipv6_local_error(),
189  * which are using proper atomic operations or spinlocks.
190  */
/* ip6_xmit() - build the IPv6 header (and any extension headers from @opt)
 * in front of the payload already in @skb and send it through the
 * NF_INET_LOCAL_OUT hook to dst_output().
 *
 * @sk:     originating socket (const; see note above about locking)
 * @skb:    payload; its dst must already be set
 * @fl6:    flow describing saddr/daddr/proto/flowlabel
 * @mark:   skb mark to stamp on the packet
 * @opt:    optional IPv6 tx options (hop-by-hop, routing, dest opts)
 * @tclass: traffic class for the header
 *
 * Returns 0 (or the NF_HOOK verdict) on success, -ENOBUFS if headroom
 * expansion fails, -EMSGSIZE if the packet exceeds the MTU and cannot
 * be sent (the error is also reported via ipv6_local_error()).
 */
191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192              __u32 mark, struct ipv6_txoptions *opt, int tclass)
193 {
194         struct net *net = sock_net(sk);
195         const struct ipv6_pinfo *np = inet6_sk(sk);
196         struct in6_addr *first_hop = &fl6->daddr;
197         struct dst_entry *dst = skb_dst(skb);
198         unsigned int head_room;
199         struct ipv6hdr *hdr;
200         u8  proto = fl6->flowi6_proto;
201         int seg_len = skb->len;
202         int hlimit = -1;
203         u32 mtu;
204
            /* Room needed in front of the payload: IPv6 header, link-layer
             * header, plus any extension headers from @opt.
             */
205         head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
206         if (opt)
207                 head_room += opt->opt_nflen + opt->opt_flen;
208
            /* Not enough headroom: reallocate, preserving skb ownership. */
209         if (unlikely(skb_headroom(skb) < head_room)) {
210                 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
211                 if (!skb2) {
212                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
213                                       IPSTATS_MIB_OUTDISCARDS);
214                         kfree_skb(skb);
215                         return -ENOBUFS;
216                 }
217                 if (skb->sk)
218                         skb_set_owner_w(skb2, skb->sk);
219                 consume_skb(skb);
220                 skb = skb2;
221         }
222
            /* Push extension headers; these may rewrite @proto and, for a
             * routing header, redirect @first_hop to the first intermediate
             * destination.
             */
223         if (opt) {
224                 seg_len += opt->opt_nflen + opt->opt_flen;
225
226                 if (opt->opt_flen)
227                         ipv6_push_frag_opts(skb, opt, &proto);
228
229                 if (opt->opt_nflen)
230                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
231                                              &fl6->saddr);
232         }
233
234         skb_push(skb, sizeof(struct ipv6hdr));
235         skb_reset_network_header(skb);
236         hdr = ipv6_hdr(skb);
237
238         /*
239          *      Fill in the IPv6 header
240          */
            /* Hop limit: per-socket value if set, else route default. */
241         if (np)
242                 hlimit = np->hop_limit;
243         if (hlimit < 0)
244                 hlimit = ip6_dst_hoplimit(dst);
245
246         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
247                                 ip6_autoflowlabel(net, np), fl6));
248
249         hdr->payload_len = htons(seg_len);
250         hdr->nexthdr = proto;
251         hdr->hop_limit = hlimit;
252
253         hdr->saddr = fl6->saddr;
254         hdr->daddr = *first_hop;
255
256         skb->protocol = htons(ETH_P_IPV6);
257         skb->priority = sk->sk_priority;
258         skb->mark = mark;
259
260         mtu = dst_mtu(dst);
261         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
262                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
263                               IPSTATS_MIB_OUT, skb->len);
264
265                 /* if egress device is enslaved to an L3 master device pass the
266                  * skb to its handler for processing
267                  */
268                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
269                 if (unlikely(!skb))
270                         return 0;
271
272                 /* hooks should never assume socket lock is held.
273                  * we promote our socket to non const
274                  */
275                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
276                                net, (struct sock *)sk, skb, NULL, dst->dev,
277                                dst_output);
278         }
279
            /* Packet too big and DF semantics apply: report EMSGSIZE. */
280         skb->dev = dst->dev;
281         /* ipv6_local_error() does not require socket lock,
282          * we promote our socket to non const
283          */
284         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
285
286         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
287         kfree_skb(skb);
288         return -EMSGSIZE;
289 }
290 EXPORT_SYMBOL(ip6_xmit);
291
/* ip6_call_ra_chain() - deliver a Router Alert packet to interested raw
 * sockets.
 *
 * Walks the global ip6_ra_chain under ip6_ra_lock and hands a clone of
 * @skb to every raw socket registered for Router Alert value @sel (and
 * matching the socket's bound device, if any); the last match receives
 * @skb itself, consuming it.
 *
 * Returns 1 when at least one socket took the packet (skb consumed),
 * 0 otherwise (caller keeps ownership of @skb).
 */
292 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
293 {
294         struct ip6_ra_chain *ra;
295         struct sock *last = NULL;
296
297         read_lock(&ip6_ra_lock);
298         for (ra = ip6_ra_chain; ra; ra = ra->next) {
299                 struct sock *sk = ra->sk;
300                 if (sk && ra->sel == sel &&
301                     (!sk->sk_bound_dev_if ||
302                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
                            /* The previous match gets a clone so the original
                             * skb can still go to a later (or the last) match.
                             * A failed clone silently skips that socket.
                             */
303                         if (last) {
304                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
305                                 if (skb2)
306                                         rawv6_rcv(last, skb2);
307                         }
308                         last = sk;
309                 }
310         }
311
312         if (last) {
313                 rawv6_rcv(last, skb);
314                 read_unlock(&ip6_ra_lock);
315                 return 1;
316         }
317         read_unlock(&ip6_ra_lock);
318         return 0;
319 }
320
/* ip6_forward_proxy_check() - decide how to treat a packet whose
 * destination is an NDISC-proxied address.
 *
 * Returns:
 *   1  - unicast neighbour-discovery ICMPv6 message; deliver locally,
 *   0  - anything else that may be forwarded normally,
 *  -1  - link-local destination that cannot be proxied; the sender has
 *        been signalled via dst_link_failure() and the caller drops.
 */
321 static int ip6_forward_proxy_check(struct sk_buff *skb)
322 {
323         struct ipv6hdr *hdr = ipv6_hdr(skb);
324         u8 nexthdr = hdr->nexthdr;
325         __be16 frag_off;
326         int offset;
327
            /* Skip over any extension headers to find the upper-layer
             * protocol; an unparsable chain is simply forwarded (0).
             */
328         if (ipv6_ext_hdr(nexthdr)) {
329                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
330                 if (offset < 0)
331                         return 0;
332         } else
333                 offset = sizeof(struct ipv6hdr);
334
335         if (nexthdr == IPPROTO_ICMPV6) {
336                 struct icmp6hdr *icmp6;
337
                    /* Make sure at least the ICMPv6 type byte is linear. */
338                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
339                                          offset + 1 - skb->data)))
340                         return 0;
341
342                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
343
344                 switch (icmp6->icmp6_type) {
345                 case NDISC_ROUTER_SOLICITATION:
346                 case NDISC_ROUTER_ADVERTISEMENT:
347                 case NDISC_NEIGHBOUR_SOLICITATION:
348                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
349                 case NDISC_REDIRECT:
350                         /* For reaction involving unicast neighbor discovery
351                          * message destined to the proxied address, pass it to
352                          * input function.
353                          */
354                         return 1;
355                 default:
356                         break;
357                 }
358         }
359
360         /*
361          * The proxying router can't forward traffic sent to a link-local
362          * address, so signal the sender and discard the packet. This
363          * behavior is clarified by the MIPv6 specification.
364          */
365         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
366                 dst_link_failure(skb);
367                 return -1;
368         }
369
370         return 0;
371 }
372
373 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
374                                      struct sk_buff *skb)
375 {
376         struct dst_entry *dst = skb_dst(skb);
377
378         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
379         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
380
381 #ifdef CONFIG_NET_SWITCHDEV
382         if (skb->offload_l3_fwd_mark) {
383                 consume_skb(skb);
384                 return 0;
385         }
386 #endif
387
388         return dst_output(net, sk, skb);
389 }
390
391 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
392 {
393         if (skb->len <= mtu)
394                 return false;
395
396         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
397         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
398                 return true;
399
400         if (skb->ignore_df)
401                 return false;
402
403         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
404                 return false;
405
406         return true;
407 }
408
/* ip6_forward() - forward an IPv6 packet received on one interface to
 * the device selected by its dst.
 *
 * Validates that forwarding is enabled and the packet is eligible
 * (host-addressed, not locally owned, not LRO-merged, passes xfrm
 * policy), delivers Router Alert packets to raw sockets, enforces the
 * hop limit, handles NDISC proxying, emits redirects where allowed,
 * checks the path MTU, decrements hop_limit and finally hands the
 * packet to the NF_INET_FORWARD hook -> ip6_forward_finish().
 *
 * Consumes @skb on every path.  Returns 0, the NF_HOOK verdict, or a
 * negative errno (-EINVAL, -ETIMEDOUT, -EMSGSIZE).
 */
409 int ip6_forward(struct sk_buff *skb)
410 {
411         struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
412         struct dst_entry *dst = skb_dst(skb);
413         struct ipv6hdr *hdr = ipv6_hdr(skb);
414         struct inet6_skb_parm *opt = IP6CB(skb);
415         struct net *net = dev_net(dst->dev);
416         u32 mtu;
417
418         if (net->ipv6.devconf_all->forwarding == 0)
419                 goto error;
420
421         if (skb->pkt_type != PACKET_HOST)
422                 goto drop;
423
            /* A socket-owned skb must not be forwarded. */
424         if (unlikely(skb->sk))
425                 goto drop;
426
427         if (skb_warn_if_lro(skb))
428                 goto drop;
429
430         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
431                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
432                 goto drop;
433         }
434
435         skb_forward_csum(skb);
436
437         /*
438          *      We DO NOT make any processing on
439          *      RA packets, pushing them to user level AS IS
440          *      without any WARRANTY that application will be able
441          *      to interpret them. The reason is that we
442          *      cannot make anything clever here.
443          *
444          *      We are not end-node, so that if packet contains
445          *      AH/ESP, we cannot make anything.
446          *      Defragmentation also would be mistake, RA packets
447          *      cannot be fragmented, because there is no warranty
448          *      that different fragments will go along one path. --ANK
449          */
450         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
451                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
452                         return 0;
453         }
454
455         /*
456          *      check and decrement ttl
457          */
458         if (hdr->hop_limit <= 1) {
459                 /* Force OUTPUT device used as source address */
460                 skb->dev = dst->dev;
461                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
462                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
463
464                 kfree_skb(skb);
465                 return -ETIMEDOUT;
466         }
467
468         /* XXX: idev->cnf.proxy_ndp? */
469         if (net->ipv6.devconf_all->proxy_ndp &&
470             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
471                 int proxied = ip6_forward_proxy_check(skb);
472                 if (proxied > 0)
473                         return ip6_input(skb);
474                 else if (proxied < 0) {
475                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
476                         goto drop;
477                 }
478         }
479
480         if (!xfrm6_route_forward(skb)) {
481                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
482                 goto drop;
483         }
            /* xfrm6_route_forward() may have replaced the dst. */
484         dst = skb_dst(skb);
485
486         /* IPv6 specs say nothing about it, but it is clear that we cannot
487            send redirects to source routed frames.
488            We don't send redirects to frames decapsulated from IPsec.
489          */
490         if (IP6CB(skb)->iif == dst->dev->ifindex &&
491             opt->srcrt == 0 && !skb_sec_path(skb)) {
492                 struct in6_addr *target = NULL;
493                 struct inet_peer *peer;
494                 struct rt6_info *rt;
495
496                 /*
497                  *      incoming and outgoing devices are the same
498                  *      send a redirect.
499                  */
500
501                 rt = (struct rt6_info *) dst;
502                 if (rt->rt6i_flags & RTF_GATEWAY)
503                         target = &rt->rt6i_gateway;
504                 else
505                         target = &hdr->daddr;
506
507                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
508
509                 /* Limit redirects both by destination (here)
510                    and by source (inside ndisc_send_redirect)
511                  */
512                 if (inet_peer_xrlim_allow(peer, 1*HZ))
513                         ndisc_send_redirect(skb, target);
514                 if (peer)
515                         inet_putpeer(peer);
516         } else {
517                 int addrtype = ipv6_addr_type(&hdr->saddr);
518
519                 /* This check is security critical. */
520                 if (addrtype == IPV6_ADDR_ANY ||
521                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
522                         goto error;
523                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
524                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
525                                     ICMPV6_NOT_NEIGHBOUR, 0);
526                         goto error;
527                 }
528         }
529
530         mtu = ip6_dst_mtu_forward(dst);
531         if (mtu < IPV6_MIN_MTU)
532                 mtu = IPV6_MIN_MTU;
533
            /* Routers never fragment: too-big packets bounce back with
             * an ICMPv6 Packet Too Big carrying the path MTU.
             */
534         if (ip6_pkt_too_big(skb, mtu)) {
535                 /* Again, force OUTPUT device used as source address */
536                 skb->dev = dst->dev;
537                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
538                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
539                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
540                                 IPSTATS_MIB_FRAGFAILS);
541                 kfree_skb(skb);
542                 return -EMSGSIZE;
543         }
544
545         if (skb_cow(skb, dst->dev->hard_header_len)) {
546                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
547                                 IPSTATS_MIB_OUTDISCARDS);
548                 goto drop;
549         }
550
551         hdr = ipv6_hdr(skb);
552
553         /* Mangling hops number delayed to point after skb COW */
554
555         hdr->hop_limit--;
556
557         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
558                        net, NULL, skb, skb->dev, dst->dev,
559                        ip6_forward_finish);
560
561 error:
562         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
563 drop:
564         kfree_skb(skb);
565         return -EINVAL;
566 }
567
568 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
569 {
570         to->pkt_type = from->pkt_type;
571         to->priority = from->priority;
572         to->protocol = from->protocol;
573         skb_dst_drop(to);
574         skb_dst_set(to, dst_clone(skb_dst(from)));
575         to->dev = from->dev;
576         to->mark = from->mark;
577
578         skb_copy_hash(to, from);
579
580 #ifdef CONFIG_NET_SCHED
581         to->tc_index = from->tc_index;
582 #endif
583         nf_copy(to, from);
584         skb_copy_secmark(to, from);
585 }
586
/* ip6_fragment() - split @skb into IPv6 fragments and send each through
 * @output.
 *
 * Two strategies:
 *  - fast path: when the skb carries a well-shaped frag_list (each
 *    fragment already fits the MTU on an 8-byte boundary), the existing
 *    buffers are converted in place into fragments;
 *  - slow path: otherwise new skbs are allocated and the payload is
 *    copied out block by block.
 *
 * Consumes @skb (and any fragment list) on every path.  Returns 0 on
 * success, -EMSGSIZE when fragmentation is not permitted or possible,
 * -ENOMEM on allocation failure, or the first error from @output.
 */
587 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
588                  int (*output)(struct net *, struct sock *, struct sk_buff *))
589 {
590         struct sk_buff *frag;
591         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
            /* Per-socket frag_size only applies when not in a nested
             * transmit (dev_recursion_level()). */
592         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
593                                 inet6_sk(skb->sk) : NULL;
594         struct ipv6hdr *tmp_hdr;
595         struct frag_hdr *fh;
596         unsigned int mtu, hlen, left, len;
597         int hroom, troom;
598         __be32 frag_id;
599         int ptr, offset = 0, err = 0;
600         u8 *prevhdr, nexthdr = 0;
601
            /* hlen = length of the per-fragment (unfragmentable) header
             * part; prevhdr points at the nexthdr byte to be rewritten to
             * NEXTHDR_FRAGMENT. */
602         err = ip6_find_1stfragopt(skb, &prevhdr);
603         if (err < 0)
604                 goto fail;
605         hlen = err;
606         nexthdr = *prevhdr;
607
608         mtu = ip6_skb_dst_mtu(skb);
609
610         /* We must not fragment if the socket is set to force MTU discovery
611          * or if the skb is not generated by a local socket.
612          */
613         if (unlikely(!skb->ignore_df && skb->len > mtu))
614                 goto fail_toobig;
615
616         if (IP6CB(skb)->frag_max_size) {
617                 if (IP6CB(skb)->frag_max_size > mtu)
618                         goto fail_toobig;
619
620                 /* don't send fragments larger than what we received */
621                 mtu = IP6CB(skb)->frag_max_size;
622                 if (mtu < IPV6_MIN_MTU)
623                         mtu = IPV6_MIN_MTU;
624         }
625
626         if (np && np->frag_size < mtu) {
627                 if (np->frag_size)
628                         mtu = np->frag_size;
629         }
            /* Need room for the per-fragment headers plus at least 8 bytes
             * of payload per fragment. */
630         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
631                 goto fail_toobig;
            /* From here on, mtu = payload bytes available per fragment. */
632         mtu -= hlen + sizeof(struct frag_hdr);
633
634         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
635                                     &ipv6_hdr(skb)->saddr);
636
637         if (skb->ip_summed == CHECKSUM_PARTIAL &&
638             (err = skb_checksum_help(skb)))
639                 goto fail;
640
641         hroom = LL_RESERVED_SPACE(rt->dst.dev);
            /* Fast path: reuse the existing frag_list members as the
             * fragments, provided their geometry already matches. */
642         if (skb_has_frag_list(skb)) {
643                 unsigned int first_len = skb_pagelen(skb);
644                 struct sk_buff *frag2;
645
646                 if (first_len - hlen > mtu ||
647                     ((first_len - hlen) & 7) ||
648                     skb_cloned(skb) ||
649                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
650                         goto slow_path;
651
652                 skb_walk_frags(skb, frag) {
653                         /* Correct geometry. */
654                         if (frag->len > mtu ||
655                             ((frag->len & 7) && frag->next) ||
656                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
657                                 goto slow_path_clean;
658
659                         /* Partially cloned skb? */
660                         if (skb_shared(frag))
661                                 goto slow_path_clean;
662
663                         BUG_ON(frag->sk);
664                         if (skb->sk) {
665                                 frag->sk = skb->sk;
666                                 frag->destructor = sock_wfree;
667                         }
668                         skb->truesize -= frag->truesize;
669                 }
670
671                 err = 0;
672                 offset = 0;
673                 /* BUILD HEADER */
674
675                 *prevhdr = NEXTHDR_FRAGMENT;
676                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
677                 if (!tmp_hdr) {
678                         err = -ENOMEM;
679                         goto fail;
680                 }
681                 frag = skb_shinfo(skb)->frag_list;
682                 skb_frag_list_init(skb);
683
                    /* Insert the fragment header between the unfragmentable
                     * part and the payload of the first fragment. */
684                 __skb_pull(skb, hlen);
685                 fh = __skb_push(skb, sizeof(struct frag_hdr));
686                 __skb_push(skb, hlen);
687                 skb_reset_network_header(skb);
688                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
689
690                 fh->nexthdr = nexthdr;
691                 fh->reserved = 0;
692                 fh->frag_off = htons(IP6_MF);
693                 fh->identification = frag_id;
694
695                 first_len = skb_pagelen(skb);
696                 skb->data_len = first_len - skb_headlen(skb);
697                 skb->len = first_len;
698                 ipv6_hdr(skb)->payload_len = htons(first_len -
699                                                    sizeof(struct ipv6hdr));
700
701                 for (;;) {
702                         /* Prepare header of the next frame,
703                          * before previous one went down. */
704                         if (frag) {
705                                 frag->ip_summed = CHECKSUM_NONE;
706                                 skb_reset_transport_header(frag);
707                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
708                                 __skb_push(frag, hlen);
709                                 skb_reset_network_header(frag);
710                                 memcpy(skb_network_header(frag), tmp_hdr,
711                                        hlen);
712                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
713                                 fh->nexthdr = nexthdr;
714                                 fh->reserved = 0;
715                                 fh->frag_off = htons(offset);
716                                 if (frag->next)
717                                         fh->frag_off |= htons(IP6_MF);
718                                 fh->identification = frag_id;
719                                 ipv6_hdr(frag)->payload_len =
720                                                 htons(frag->len -
721                                                       sizeof(struct ipv6hdr));
722                                 ip6_copy_metadata(frag, skb);
723                         }
724
725                         err = output(net, sk, skb);
726                         if (!err)
727                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
728                                               IPSTATS_MIB_FRAGCREATES);
729
730                         if (err || !frag)
731                                 break;
732
733                         skb = frag;
734                         frag = skb->next;
735                         skb_mark_not_on_list(skb);
736                 }
737
738                 kfree(tmp_hdr);
739
740                 if (err == 0) {
741                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
742                                       IPSTATS_MIB_FRAGOKS);
743                         return 0;
744                 }
745
                    /* On error, any not-yet-sent fragments must be freed. */
746                 kfree_skb_list(frag);
747
748                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
749                               IPSTATS_MIB_FRAGFAILS);
750                 return err;
751
752 slow_path_clean:
                    /* Undo the ownership transfer done above for the frags
                     * we already walked, then fall back to the slow path. */
753                 skb_walk_frags(skb, frag2) {
754                         if (frag2 == frag)
755                                 break;
756                         frag2->sk = NULL;
757                         frag2->destructor = NULL;
758                         skb->truesize += frag2->truesize;
759                 }
760         }
761
762 slow_path:
763         left = skb->len - hlen;         /* Space per frame */
764         ptr = hlen;                     /* Where to start from */
765
766         /*
767          *      Fragment the datagram.
768          */
769
770         troom = rt->dst.dev->needed_tailroom;
771
772         /*
773          *      Keep copying data until we run out.
774          */
775         while (left > 0)        {
776                 u8 *fragnexthdr_offset;
777
778                 len = left;
779                 /* IF: it doesn't fit, use 'mtu' - the data space left */
780                 if (len > mtu)
781                         len = mtu;
782                 /* IF: we are not sending up to and including the packet end
783                    then align the next start on an eight byte boundary */
784                 if (len < left) {
785                         len &= ~7;
786                 }
787
788                 /* Allocate buffer */
789                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
790                                  hroom + troom, GFP_ATOMIC);
791                 if (!frag) {
792                         err = -ENOMEM;
793                         goto fail;
794                 }
795
796                 /*
797                  *      Set up data on packet
798                  */
799
800                 ip6_copy_metadata(frag, skb);
801                 skb_reserve(frag, hroom);
802                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
803                 skb_reset_network_header(frag);
804                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
805                 frag->transport_header = (frag->network_header + hlen +
806                                           sizeof(struct frag_hdr));
807
808                 /*
809                  *      Charge the memory for the fragment to any owner
810                  *      it might possess
811                  */
812                 if (skb->sk)
813                         skb_set_owner_w(frag, skb->sk);
814
815                 /*
816                  *      Copy the packet header into the new buffer.
817                  */
818                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
819
                    /* Rewrite the copied nexthdr byte (at the same offset as
                     * prevhdr in the original) to point at the frag header. */
820                 fragnexthdr_offset = skb_network_header(frag);
821                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
822                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
823
824                 /*
825                  *      Build fragment header.
826                  */
827                 fh->nexthdr = nexthdr;
828                 fh->reserved = 0;
829                 fh->identification = frag_id;
830
831                 /*
832                  *      Copy a block of the IP datagram.
833                  */
834                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
835                                      len));
836                 left -= len;
837
838                 fh->frag_off = htons(offset);
839                 if (left > 0)
840                         fh->frag_off |= htons(IP6_MF);
841                 ipv6_hdr(frag)->payload_len = htons(frag->len -
842                                                     sizeof(struct ipv6hdr));
843
844                 ptr += len;
845                 offset += len;
846
847                 /*
848                  *      Put this fragment into the sending queue.
849                  */
850                 err = output(net, sk, frag);
851                 if (err)
852                         goto fail;
853
854                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
855                               IPSTATS_MIB_FRAGCREATES);
856         }
857         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
858                       IPSTATS_MIB_FRAGOKS);
859         consume_skb(skb);
860         return err;
861
862 fail_toobig:
863         if (skb->sk && dst_allfrag(skb_dst(skb)))
864                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
865
866         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
867         err = -EMSGSIZE;
868
869 fail:
870         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
871                       IPSTATS_MIB_FRAGFAILS);
872         kfree_skb(skb);
873         return err;
874 }
875
876 static inline int ip6_rt_check(const struct rt6key *rt_key,
877                                const struct in6_addr *fl_addr,
878                                const struct in6_addr *addr_cache)
879 {
880         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
881                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
882 }
883
/* Validate the socket-cached dst @dst against flow @fl6.
 *
 * Returns @dst when it is still usable for this flow; otherwise releases
 * the reference and returns NULL so the caller performs a fresh lookup.
 * @dst may be NULL (nothing cached).
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* A cached dst of another address family cannot serve an IPv6 flow. */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
930
/* Resolve a route for @fl6 into *@dst, first selecting a source address
 * when the flow has none (saddr is the unspecified address).
 *
 * On success returns 0 with *@dst holding the route; on failure returns a
 * negative errno, releases any dst, and sets *@dst to NULL.  *@dst may be
 * pre-populated by the caller with an earlier lookup result.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		/* rt->from is RCU-protected; keep the read lock held
		 * across the saddr selection.
		 */
		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	/* Only a missing or non-VALID neighbour triggers the fallback. */
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* A v4-mapped source is only acceptable with a v4-mapped (or
	 * unspecified) destination.
	 */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1048
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1066
1067 /**
1068  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1069  *      @sk: socket which provides route info
1070  *      @fl6: flow to lookup
1071  *      @final_dst: final destination address for ipsec lookup
1072  *
1073  *      This function performs a route lookup on the given flow.
1074  *
1075  *      It returns a valid dst pointer on success, or a pointer encoded
1076  *      error code.
1077  */
1078 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1079                                       const struct in6_addr *final_dst)
1080 {
1081         struct dst_entry *dst = NULL;
1082         int err;
1083
1084         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1085         if (err)
1086                 return ERR_PTR(err);
1087         if (final_dst)
1088                 fl6->daddr = *final_dst;
1089
1090         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1091 }
1092 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1093
1094 /**
1095  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1096  *      @sk: socket which provides the dst cache and route info
1097  *      @fl6: flow to lookup
1098  *      @final_dst: final destination address for ipsec lookup
1099  *      @connected: whether @sk is connected or not
1100  *
1101  *      This function performs a route lookup on the given flow with the
1102  *      possibility of using the cached route in the socket if it is valid.
1103  *      It will take the socket dst lock when operating on the dst cache.
1104  *      As a result, this function can only be used in process context.
1105  *
1106  *      In addition, for a connected socket, cache the dst in the socket
1107  *      if the current cache is not valid.
1108  *
1109  *      It returns a valid dst pointer on success, or a pointer encoded
1110  *      error code.
1111  */
1112 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1113                                          const struct in6_addr *final_dst,
1114                                          bool connected)
1115 {
1116         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1117
1118         dst = ip6_sk_dst_check(sk, dst, fl6);
1119         if (dst)
1120                 return dst;
1121
1122         dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1123         if (connected && !IS_ERR(dst))
1124                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1125
1126         return dst;
1127 }
1128 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1129
1130 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1131                                                gfp_t gfp)
1132 {
1133         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1134 }
1135
1136 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1137                                                 gfp_t gfp)
1138 {
1139         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1140 }
1141
1142 static void ip6_append_data_mtu(unsigned int *mtu,
1143                                 int *maxfraglen,
1144                                 unsigned int fragheaderlen,
1145                                 struct sk_buff *skb,
1146                                 struct rt6_info *rt,
1147                                 unsigned int orig_mtu)
1148 {
1149         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1150                 if (!skb) {
1151                         /* first fragment, reserve header_len */
1152                         *mtu = orig_mtu - rt->dst.header_len;
1153
1154                 } else {
1155                         /*
1156                          * this fragment is not first, the headers
1157                          * space is regarded as data space.
1158                          */
1159                         *mtu = orig_mtu;
1160                 }
1161                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1162                               + fragheaderlen - sizeof(struct frag_hdr);
1163         }
1164 }
1165
/* Initialize @cork/@v6_cork for a corked send: duplicate the tx options
 * from @ipc6, take a reference on @rt and record the flow, MTU and flags.
 *
 * Returns 0 on success or a negative errno.  On -ENOBUFS some options may
 * already have been duplicated into v6_cork->opt; NOTE(review): this
 * appears to rely on the caller releasing them (cork teardown) — confirm.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		/* Options must not already be set on this cork. */
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		/* Deep-copy each extension header that is present. */
		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	/* Pick the MTU: the device MTU when probing PMTU, otherwise the
	 * dst MTU (for XFRM tunnels the outer dst, else the xfrm path dst).
	 */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	/* A smaller per-socket frag_size (when non-zero) caps the MTU. */
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}
1241
/* Core of ip6_append_data(): append @length bytes obtained via @getfrag
 * to @queue, growing the tail skb or allocating new skbs so each packet
 * fits the corked MTU (or IP6_MAX_MTU with GSO).
 *
 * @transhdrlen is non-zero only when the queue is empty (first call) and
 * reserves transport-header room in the first skb.  Returns 0 on success
 * or a negative errno; skbs already queued are left on @queue.
 */
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	/* NOTE(review): extra_uref is only assigned on the zerocopy path;
	 * it is consumed only when uarg != NULL (which implies it was set)
	 * or via pointer, but initializing it to false would be safer.
	 */
	bool paged, extra_uref;

	skb = skb_peek_tail(queue);
	if (!skb) {
		/* First skb of the message: account option and dst header
		 * space that later skbs treat as plain data space.
		 */
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	/* With GSO the size limit is IP6_MAX_MTU, not the path MTU. */
	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	/* Largest 8-byte-aligned payload end within the MTU. */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = true;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			/* Zerocopy not usable: fall back to copying, but
			 * keep uarg for completion notification.
			 */
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged)
				alloclen = fraglen;
			else {
				/* Paged path: only the headers go in the
				 * linear area; the rest goes in page frags.
				 */
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				/* Later skbs: allocate only while within
				 * twice the send-buffer budget.
				 */
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the overhang of the previous skb into
				 * this one, keeping its checksum consistent.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			/* No scatter-gather: copy into the linear tailroom. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			/* Copy into the socket's page fragment, coalescing
			 * with the last frag when possible.
			 */
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			/* True zerocopy: reference the user pages directly. */
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	/* Charge all the newly-queued memory to the socket in one go. */
	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	if (uarg)
		sock_zerocopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}
1585
1586 int ip6_append_data(struct sock *sk,
1587                     int getfrag(void *from, char *to, int offset, int len,
1588                                 int odd, struct sk_buff *skb),
1589                     void *from, int length, int transhdrlen,
1590                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1591                     struct rt6_info *rt, unsigned int flags)
1592 {
1593         struct inet_sock *inet = inet_sk(sk);
1594         struct ipv6_pinfo *np = inet6_sk(sk);
1595         int exthdrlen;
1596         int err;
1597
1598         if (flags&MSG_PROBE)
1599                 return 0;
1600         if (skb_queue_empty(&sk->sk_write_queue)) {
1601                 /*
1602                  * setup for corking
1603                  */
1604                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1605                                      ipc6, rt, fl6);
1606                 if (err)
1607                         return err;
1608
1609                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1610                 length += exthdrlen;
1611                 transhdrlen += exthdrlen;
1612         } else {
1613                 fl6 = &inet->cork.fl.u.ip6;
1614                 transhdrlen = 0;
1615         }
1616
1617         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1618                                  &np->cork, sk_page_frag(sk), getfrag,
1619                                  from, length, transhdrlen, flags, ipc6);
1620 }
1621 EXPORT_SYMBOL_GPL(ip6_append_data);
1622
1623 static void ip6_cork_release(struct inet_cork_full *cork,
1624                              struct inet6_cork *v6_cork)
1625 {
1626         if (v6_cork->opt) {
1627                 kfree(v6_cork->opt->dst0opt);
1628                 kfree(v6_cork->opt->dst1opt);
1629                 kfree(v6_cork->opt->hopopt);
1630                 kfree(v6_cork->opt->srcrt);
1631                 kfree(v6_cork->opt);
1632                 v6_cork->opt = NULL;
1633         }
1634
1635         if (cork->base.dst) {
1636                 dst_release(cork->base.dst);
1637                 cork->base.dst = NULL;
1638                 cork->base.flags &= ~IPCORK_ALLFRAG;
1639         }
1640         memset(&cork->fl, 0, sizeof(cork->fl));
1641 }
1642
/*
 *	Coalesce all pending fragments on @queue into a single skb (the
 *	tail skbs become the head skb's frag_list), prepend the extension
 *	headers and the IPv6 header, and update output statistics.
 *	Releases the cork state before returning the finished skb; returns
 *	NULL (without releasing the cork) if the queue is empty.  The
 *	caller takes ownership of the returned skb.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining queued skbs onto the head skb's frag_list,
	 * stripping their network headers and folding their sizes into
	 * the head skb's accounting.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		/* Ownership moved to the head skb: drop the per-skb
		 * socket accounting on the chained fragment.
		 */
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	/* Snapshot the destination; a routing header pushed below may
	 * rewrite it (ipv6_push_nfrag_opts takes &final_dst).
	 */
	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1718
1719 int ip6_send_skb(struct sk_buff *skb)
1720 {
1721         struct net *net = sock_net(skb->sk);
1722         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1723         int err;
1724
1725         err = ip6_local_out(net, skb->sk, skb);
1726         if (err) {
1727                 if (err > 0)
1728                         err = net_xmit_errno(err);
1729                 if (err)
1730                         IP6_INC_STATS(net, rt->rt6i_idev,
1731                                       IPSTATS_MIB_OUTDISCARDS);
1732         }
1733
1734         return err;
1735 }
1736
/*
 *	Finalize and transmit everything queued on sk->sk_write_queue.
 *	A NULL from ip6_finish_skb() means nothing was pending; that is
 *	not an error.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1748
1749 static void __ip6_flush_pending_frames(struct sock *sk,
1750                                        struct sk_buff_head *queue,
1751                                        struct inet_cork_full *cork,
1752                                        struct inet6_cork *v6_cork)
1753 {
1754         struct sk_buff *skb;
1755
1756         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1757                 if (skb_dst(skb))
1758                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1759                                       IPSTATS_MIB_OUTDISCARDS);
1760                 kfree_skb(skb);
1761         }
1762
1763         ip6_cork_release(cork, v6_cork);
1764 }
1765
1766 void ip6_flush_pending_frames(struct sock *sk)
1767 {
1768         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1769                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1770 }
1771 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1772
/*
 *	Build a complete, single-shot packet without touching
 *	sk->sk_write_queue: the data is appended onto a private queue and
 *	immediately finalized.  Uses the caller-provided @cork (base part)
 *	plus a stack-local inet6_cork; both are released on every path
 *	before return (by ip6_cork_release, __ip6_flush_pending_frames, or
 *	__ip6_make_skb).  Returns the finished skb, an ERR_PTR on failure,
 *	or NULL for MSG_PROBE.
 */
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	/* Frag-able destination options add to the wire length, mirroring
	 * the first-call accounting in ip6_append_data().
	 */
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	/* Start from a clean cork; ip6_setup_cork() fills it in. */
	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		/* Setup may have partially populated the cork. */
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	/* Negative means "unset": fall back to the socket's setting. */
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		/* Drops any partially queued skbs and releases the cork. */
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}