net/ipv6/ip6_output.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *      IPv6 output functions
   4  *      Linux INET6 implementation
   5  *
   6  *      Authors:
   7  *      Pedro Roque             <roque@di.fc.ul.pt>
   8  *
   9  *      Based on linux/net/ipv4/ip_output.c
  10  *
  11  *      Changes:
   12  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
  13  *                              extension headers are implemented.
  14  *                              route changes now work.
  15  *                              ip6_forward does not confuse sniffers.
  16  *                              etc.
  17  *
  18  *      H. von Brand    :       Added missing #include <linux/string.h>
  19  *      Imran Patel     :       frag id should be in NBO
  20  *      Kazunori MIYAZAWA @USAGI
  21  *                      :       add ip6_append_data and related functions
  22  *                              for datagram xmit
  23  */
  24
  25 #include <linux/errno.h>
  26 #include <linux/kernel.h>
  27 #include <linux/string.h>
  28 #include <linux/socket.h>
  29 #include <linux/net.h>
  30 #include <linux/netdevice.h>
  31 #include <linux/if_arp.h>
  32 #include <linux/in6.h>
  33 #include <linux/tcp.h>
  34 #include <linux/route.h>
  35 #include <linux/module.h>
  36 #include <linux/slab.h>
  37
  38 #include <linux/bpf-cgroup.h>
  39 #include <linux/netfilter.h>
  40 #include <linux/netfilter_ipv6.h>
  41
  42 #include <net/sock.h>
  43 #include <net/snmp.h>
  44
  45 #include <net/ipv6.h>
  46 #include <net/ndisc.h>
  47 #include <net/protocol.h>
  48 #include <net/ip6_route.h>
  49 #include <net/addrconf.h>
  50 #include <net/rawv6.h>
  51 #include <net/icmp.h>
  52 #include <net/xfrm.h>
  53 #include <net/checksum.h>
  54 #include <linux/mroute6.h>
  55 #include <net/l3mdev.h>
  56 #include <net/lwtunnel.h>
  57 #include <net/ip_tunnels.h>
  58
  59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  60 {
  61         struct dst_entry *dst = skb_dst(skb);
  62         struct net_device *dev = dst->dev;
  63         struct inet6_dev *idev = ip6_dst_idev(dst);
  64         unsigned int hh_len = LL_RESERVED_SPACE(dev);
  65         const struct in6_addr *daddr, *nexthop;
  66         struct ipv6hdr *hdr;
  67         struct neighbour *neigh;
  68         int ret;
  69
  70         /* Be paranoid, rather than too clever. */
  71         if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
  72                 skb = skb_expand_head(skb, hh_len);
  73                 if (!skb) {
  74                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
  75                         return -ENOMEM;
  76                 }
  77         }
  78
  79         hdr = ipv6_hdr(skb);
  80         daddr = &hdr->daddr;
  81         if (ipv6_addr_is_multicast(daddr)) {
  82                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  83                     ((mroute6_is_socket(net, skb) &&
  84                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  85                      ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
  86                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  87
  88                         /* Do not check for IFF_ALLMULTI; multicast routing
  89                            is not supported in any case.
  90                          */
  91                         if (newskb)
  92                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  93                                         net, sk, newskb, NULL, newskb->dev,
  94                                         dev_loopback_xmit);
  95
  96                         if (hdr->hop_limit == 0) {
  97                                 IP6_INC_STATS(net, idev,
  98                                               IPSTATS_MIB_OUTDISCARDS);
  99                                 kfree_skb(skb);
 100                                 return 0;
 101                         }
 102                 }
 103
 104                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
 105                 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
 106                     !(dev->flags & IFF_LOOPBACK)) {
 107                         kfree_skb(skb);
 108                         return 0;
 109                 }
 110         }
 111
 112         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 113                 int res = lwtunnel_xmit(skb);
 114
 115                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 116                         return res;
 117         }
 118
 119         rcu_read_lock_bh();
 120         nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
 121         neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
 122         if (unlikely(!neigh))
 123                 neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
 124         if (!IS_ERR(neigh)) {
 125                 sock_confirm_neigh(skb, neigh);
 126                 ret = neigh_output(neigh, skb, false);
 127                 rcu_read_unlock_bh();
 128                 return ret;
 129         }
 130         rcu_read_unlock_bh();
 131
 132         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
 133         kfree_skb(skb);
 134         return -EINVAL;
 135 }
 136
 137 static int
 138 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
 139                                     struct sk_buff *skb, unsigned int mtu)
 140 {
 141         struct sk_buff *segs, *nskb;
 142         netdev_features_t features;
 143         int ret = 0;
 144
 145         /* Please see corresponding comment in ip_finish_output_gso
 146          * describing the cases where GSO segment length exceeds the
 147          * egress MTU.
 148          */
 149         features = netif_skb_features(skb);
 150         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 151         if (IS_ERR_OR_NULL(segs)) {
 152                 kfree_skb(skb);
 153                 return -ENOMEM;
 154         }
 155
 156         consume_skb(skb);
 157
 158         skb_list_walk_safe(segs, segs, nskb) {
 159                 int err;
 160
 161                 skb_mark_not_on_list(segs);
 162                 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
 163                 if (err && ret == 0)
 164                         ret = err;
 165         }
 166
 167         return ret;
 168 }
 169
 170 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 171 {
 172         unsigned int mtu;
 173
 174 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 175         /* Policy lookup after SNAT yielded a new policy */
 176         if (skb_dst(skb)->xfrm) {
 177                 IPCB(skb)->flags |= IPSKB_REROUTED;
 178                 return dst_output(net, sk, skb);
 179         }
 180 #endif
 181
 182         mtu = ip6_skb_dst_mtu(skb);
 183         if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
 184                 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
 185
 186         if ((skb->len > mtu && !skb_is_gso(skb)) ||
 187             dst_allfrag(skb_dst(skb)) ||
 188             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 189                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 190         else
 191                 return ip6_finish_output2(net, sk, skb);
 192 }
 193
 194 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 195 {
 196         int ret;
 197
 198         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 199         switch (ret) {
 200         case NET_XMIT_SUCCESS:
 201                 return __ip6_finish_output(net, sk, skb);
 202         case NET_XMIT_CN:
 203                 return __ip6_finish_output(net, sk, skb) ? : ret;
 204         default:
 205                 kfree_skb(skb);
 206                 return ret;
 207         }
 208 }
 209
 210 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 211 {
 212         struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
 213         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 214
 215         skb->protocol = htons(ETH_P_IPV6);
 216         skb->dev = dev;
 217
 218         if (unlikely(idev->cnf.disable_ipv6)) {
 219                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 220                 kfree_skb(skb);
 221                 return 0;
 222         }
 223
 224         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 225                             net, sk, skb, indev, dev,
 226                             ip6_finish_output,
 227                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 228 }
 229 EXPORT_SYMBOL(ip6_output);
 230
 231 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 232 {
 233         if (!np->autoflowlabel_set)
 234                 return ip6_default_np_autolabel(net);
 235         else
 236                 return np->autoflowlabel;
 237 }
 238
 239 /*
 240  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 241  * Note : socket lock is not held for SYNACK packets, but might be modified
 242  * by calls to skb_set_owner_w() and ipv6_local_error(),
 243  * which are using proper atomic operations or spinlocks.
 244  */
 245 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 246              __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
 247 {
 248         struct net *net = sock_net(sk);
 249         const struct ipv6_pinfo *np = inet6_sk(sk);
 250         struct in6_addr *first_hop = &fl6->daddr;
 251         struct dst_entry *dst = skb_dst(skb);
 252         struct net_device *dev = dst->dev;
 253         struct inet6_dev *idev = ip6_dst_idev(dst);
 254         unsigned int head_room;
 255         struct ipv6hdr *hdr;
 256         u8  proto = fl6->flowi6_proto;
 257         int seg_len = skb->len;
 258         int hlimit = -1;
 259         u32 mtu;
 260
 261         head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
 262         if (opt)
 263                 head_room += opt->opt_nflen + opt->opt_flen;
 264
 265         if (unlikely(head_room > skb_headroom(skb))) {
 266                 skb = skb_expand_head(skb, head_room);
 267                 if (!skb) {
 268                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 269                         return -ENOBUFS;
 270                 }
 271         }
 272
 273         if (opt) {
 274                 seg_len += opt->opt_nflen + opt->opt_flen;
 275
 276                 if (opt->opt_flen)
 277                         ipv6_push_frag_opts(skb, opt, &proto);
 278
 279                 if (opt->opt_nflen)
 280                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 281                                              &fl6->saddr);
 282         }
 283
 284         skb_push(skb, sizeof(struct ipv6hdr));
 285         skb_reset_network_header(skb);
 286         hdr = ipv6_hdr(skb);
 287
 288         /*
 289          *      Fill in the IPv6 header
 290          */
 291         if (np)
 292                 hlimit = np->hop_limit;
 293         if (hlimit < 0)
 294                 hlimit = ip6_dst_hoplimit(dst);
 295
 296         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 297                                 ip6_autoflowlabel(net, np), fl6));
 298
 299         hdr->payload_len = htons(seg_len);
 300         hdr->nexthdr = proto;
 301         hdr->hop_limit = hlimit;
 302
 303         hdr->saddr = fl6->saddr;
 304         hdr->daddr = *first_hop;
 305
 306         skb->protocol = htons(ETH_P_IPV6);
 307         skb->priority = priority;
 308         skb->mark = mark;
 309
 310         mtu = dst_mtu(dst);
 311         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 312                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
 313
 314                 /* if egress device is enslaved to an L3 master device pass the
 315                  * skb to its handler for processing
 316                  */
 317                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 318                 if (unlikely(!skb))
 319                         return 0;
 320
 321                 /* hooks should never assume socket lock is held.
 322                  * we promote our socket to non const
 323                  */
 324                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 325                                net, (struct sock *)sk, skb, NULL, dev,
 326                                dst_output);
 327         }
 328
 329         skb->dev = dev;
 330         /* ipv6_local_error() does not require socket lock,
 331          * we promote our socket to non const
 332          */
 333         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 334
 335         IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 336         kfree_skb(skb);
 337         return -EMSGSIZE;
 338 }
 339 EXPORT_SYMBOL(ip6_xmit);
 340
 341 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 342 {
 343         struct ip6_ra_chain *ra;
 344         struct sock *last = NULL;
 345
 346         read_lock(&ip6_ra_lock);
 347         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 348                 struct sock *sk = ra->sk;
 349                 if (sk && ra->sel == sel &&
 350                     (!sk->sk_bound_dev_if ||
 351                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 352                         struct ipv6_pinfo *np = inet6_sk(sk);
 353
 354                         if (np && np->rtalert_isolate &&
 355                             !net_eq(sock_net(sk), dev_net(skb->dev))) {
 356                                 continue;
 357                         }
 358                         if (last) {
 359                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 360                                 if (skb2)
 361                                         rawv6_rcv(last, skb2);
 362                         }
 363                         last = sk;
 364                 }
 365         }
 366
 367         if (last) {
 368                 rawv6_rcv(last, skb);
 369                 read_unlock(&ip6_ra_lock);
 370                 return 1;
 371         }
 372         read_unlock(&ip6_ra_lock);
 373         return 0;
 374 }
 375
 376 static int ip6_forward_proxy_check(struct sk_buff *skb)
 377 {
 378         struct ipv6hdr *hdr = ipv6_hdr(skb);
 379         u8 nexthdr = hdr->nexthdr;
 380         __be16 frag_off;
 381         int offset;
 382
 383         if (ipv6_ext_hdr(nexthdr)) {
 384                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 385                 if (offset < 0)
 386                         return 0;
 387         } else
 388                 offset = sizeof(struct ipv6hdr);
 389
 390         if (nexthdr == IPPROTO_ICMPV6) {
 391                 struct icmp6hdr *icmp6;
 392
 393                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 394                                          offset + 1 - skb->data)))
 395                         return 0;
 396
 397                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 398
 399                 switch (icmp6->icmp6_type) {
 400                 case NDISC_ROUTER_SOLICITATION:
 401                 case NDISC_ROUTER_ADVERTISEMENT:
 402                 case NDISC_NEIGHBOUR_SOLICITATION:
 403                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 404                 case NDISC_REDIRECT:
 405                         /* For reaction involving unicast neighbor discovery
 406                          * message destined to the proxied address, pass it to
 407                          * input function.
 408                          */
 409                         return 1;
 410                 default:
 411                         break;
 412                 }
 413         }
 414
 415         /*
 416          * The proxying router can't forward traffic sent to a link-local
 417          * address, so signal the sender and discard the packet. This
 418          * behavior is clarified by the MIPv6 specification.
 419          */
 420         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 421                 dst_link_failure(skb);
 422                 return -1;
 423         }
 424
 425         return 0;
 426 }
 427
 428 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 429                                      struct sk_buff *skb)
 430 {
 431         struct dst_entry *dst = skb_dst(skb);
 432
 433         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 434         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 435
 436 #ifdef CONFIG_NET_SWITCHDEV
 437         if (skb->offload_l3_fwd_mark) {
 438                 consume_skb(skb);
 439                 return 0;
 440         }
 441 #endif
 442
 443         skb->tstamp = 0;
 444         return dst_output(net, sk, skb);
 445 }
 446
 447 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 448 {
 449         if (skb->len <= mtu)
 450                 return false;
 451
 452         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 453         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 454                 return true;
 455
 456         if (skb->ignore_df)
 457                 return false;
 458
 459         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 460                 return false;
 461
 462         return true;
 463 }
 464
 465 int ip6_forward(struct sk_buff *skb)
 466 {
 467         struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
 468         struct dst_entry *dst = skb_dst(skb);
 469         struct ipv6hdr *hdr = ipv6_hdr(skb);
 470         struct inet6_skb_parm *opt = IP6CB(skb);
 471         struct net *net = dev_net(dst->dev);
 472         u32 mtu;
 473
 474         if (net->ipv6.devconf_all->forwarding == 0)
 475                 goto error;
 476
 477         if (skb->pkt_type != PACKET_HOST)
 478                 goto drop;
 479
 480         if (unlikely(skb->sk))
 481                 goto drop;
 482
 483         if (skb_warn_if_lro(skb))
 484                 goto drop;
 485
 486         if (!net->ipv6.devconf_all->disable_policy &&
 487             !idev->cnf.disable_policy &&
 488             !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 489                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 490                 goto drop;
 491         }
 492
 493         skb_forward_csum(skb);
 494
 495         /*
 496          *      We DO NOT make any processing on
 497          *      RA packets, pushing them to user level AS IS
 498          *      without ane WARRANTY that application will be able
 499          *      to interpret them. The reason is that we
 500          *      cannot make anything clever here.
 501          *
 502          *      We are not end-node, so that if packet contains
 503          *      AH/ESP, we cannot make anything.
 504          *      Defragmentation also would be mistake, RA packets
 505          *      cannot be fragmented, because there is no warranty
 506          *      that different fragments will go along one path. --ANK
 507          */
 508         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 509                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 510                         return 0;
 511         }
 512
 513         /*
 514          *      check and decrement ttl
 515          */
 516         if (hdr->hop_limit <= 1) {
 517                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 518                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 519
 520                 kfree_skb(skb);
 521                 return -ETIMEDOUT;
 522         }
 523
 524         /* XXX: idev->cnf.proxy_ndp? */
 525         if (net->ipv6.devconf_all->proxy_ndp &&
 526             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 527                 int proxied = ip6_forward_proxy_check(skb);
 528                 if (proxied > 0) {
 529                         hdr->hop_limit--;
 530                         return ip6_input(skb);
 531                 } else if (proxied < 0) {
 532                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 533                         goto drop;
 534                 }
 535         }
 536
 537         if (!xfrm6_route_forward(skb)) {
 538                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 539                 goto drop;
 540         }
 541         dst = skb_dst(skb);
 542
 543         /* IPv6 specs say nothing about it, but it is clear that we cannot
 544            send redirects to source routed frames.
 545            We don't send redirects to frames decapsulated from IPsec.
 546          */
 547         if (IP6CB(skb)->iif == dst->dev->ifindex &&
 548             opt->srcrt == 0 && !skb_sec_path(skb)) {
 549                 struct in6_addr *target = NULL;
 550                 struct inet_peer *peer;
 551                 struct rt6_info *rt;
 552
 553                 /*
 554                  *      incoming and outgoing devices are the same
 555                  *      send a redirect.
 556                  */
 557
 558                 rt = (struct rt6_info *) dst;
 559                 if (rt->rt6i_flags & RTF_GATEWAY)
 560                         target = &rt->rt6i_gateway;
 561                 else
 562                         target = &hdr->daddr;
 563
 564                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 565
 566                 /* Limit redirects both by destination (here)
 567                    and by source (inside ndisc_send_redirect)
 568                  */
 569                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 570                         ndisc_send_redirect(skb, target);
 571                 if (peer)
 572                         inet_putpeer(peer);
 573         } else {
 574                 int addrtype = ipv6_addr_type(&hdr->saddr);
 575
 576                 /* This check is security critical. */
 577                 if (addrtype == IPV6_ADDR_ANY ||
 578                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 579                         goto error;
 580                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 581                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 582                                     ICMPV6_NOT_NEIGHBOUR, 0);
 583                         goto error;
 584                 }
 585         }
 586
 587         mtu = ip6_dst_mtu_maybe_forward(dst, true);
 588         if (mtu < IPV6_MIN_MTU)
 589                 mtu = IPV6_MIN_MTU;
 590
 591         if (ip6_pkt_too_big(skb, mtu)) {
 592                 /* Again, force OUTPUT device used as source address */
 593                 skb->dev = dst->dev;
 594                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 595                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
 596                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 597                                 IPSTATS_MIB_FRAGFAILS);
 598                 kfree_skb(skb);
 599                 return -EMSGSIZE;
 600         }
 601
 602         if (skb_cow(skb, dst->dev->hard_header_len)) {
 603                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 604                                 IPSTATS_MIB_OUTDISCARDS);
 605                 goto drop;
 606         }
 607
 608         hdr = ipv6_hdr(skb);
 609
 610         /* Mangling hops number delayed to point after skb COW */
 611
 612         hdr->hop_limit--;
 613
 614         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 615                        net, NULL, skb, skb->dev, dst->dev,
 616                        ip6_forward_finish);
 617
 618 error:
 619         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 620 drop:
 621         kfree_skb(skb);
 622         return -EINVAL;
 623 }
 624
 625 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 626 {
 627         to->pkt_type = from->pkt_type;
 628         to->priority = from->priority;
 629         to->protocol = from->protocol;
 630         skb_dst_drop(to);
 631         skb_dst_set(to, dst_clone(skb_dst(from)));
 632         to->dev = from->dev;
 633         to->mark = from->mark;
 634
 635         skb_copy_hash(to, from);
 636
 637 #ifdef CONFIG_NET_SCHED
 638         to->tc_index = from->tc_index;
 639 #endif
 640         nf_copy(to, from);
 641         skb_ext_copy(to, from);
 642         skb_copy_secmark(to, from);
 643 }
 644
 645 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
 646                       u8 nexthdr, __be32 frag_id,
 647                       struct ip6_fraglist_iter *iter)
 648 {
 649         unsigned int first_len;
 650         struct frag_hdr *fh;
 651
 652         /* BUILD HEADER */
 653         *prevhdr = NEXTHDR_FRAGMENT;
 654         iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 655         if (!iter->tmp_hdr)
 656                 return -ENOMEM;
 657
 658         iter->frag = skb_shinfo(skb)->frag_list;
 659         skb_frag_list_init(skb);
 660
 661         iter->offset = 0;
 662         iter->hlen = hlen;
 663         iter->frag_id = frag_id;
 664         iter->nexthdr = nexthdr;
 665
 666         __skb_pull(skb, hlen);
 667         fh = __skb_push(skb, sizeof(struct frag_hdr));
 668         __skb_push(skb, hlen);
 669         skb_reset_network_header(skb);
 670         memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
 671
 672         fh->nexthdr = nexthdr;
 673         fh->reserved = 0;
 674         fh->frag_off = htons(IP6_MF);
 675         fh->identification = frag_id;
 676
 677         first_len = skb_pagelen(skb);
 678         skb->data_len = first_len - skb_headlen(skb);
 679         skb->len = first_len;
 680         ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 681
 682         return 0;
 683 }
 684 EXPORT_SYMBOL(ip6_fraglist_init);
 685
 686 void ip6_fraglist_prepare(struct sk_buff *skb,
 687                           struct ip6_fraglist_iter *iter)
 688 {
 689         struct sk_buff *frag = iter->frag;
 690         unsigned int hlen = iter->hlen;
 691         struct frag_hdr *fh;
 692
 693         frag->ip_summed = CHECKSUM_NONE;
 694         skb_reset_transport_header(frag);
 695         fh = __skb_push(frag, sizeof(struct frag_hdr));
 696         __skb_push(frag, hlen);
 697         skb_reset_network_header(frag);
 698         memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
 699         iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
 700         fh->nexthdr = iter->nexthdr;
 701         fh->reserved = 0;
 702         fh->frag_off = htons(iter->offset);
 703         if (frag->next)
 704                 fh->frag_off |= htons(IP6_MF);
 705         fh->identification = iter->frag_id;
 706         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 707         ip6_copy_metadata(frag, skb);
 708 }
 709 EXPORT_SYMBOL(ip6_fraglist_prepare);
 710
 711 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
 712                    unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
 713                    u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
 714 {
 715         state->prevhdr = prevhdr;
 716         state->nexthdr = nexthdr;
 717         state->frag_id = frag_id;
 718
 719         state->hlen = hlen;
 720         state->mtu = mtu;
 721
 722         state->left = skb->len - hlen;  /* Space per frame */
 723         state->ptr = hlen;              /* Where to start from */
 724
 725         state->hroom = hdr_room;
 726         state->troom = needed_tailroom;
 727
 728         state->offset = 0;
 729 }
 730 EXPORT_SYMBOL(ip6_frag_init);
 731
 732 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
 733 {
 734         u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
 735         struct sk_buff *frag;
 736         struct frag_hdr *fh;
 737         unsigned int len;
 738
 739         len = state->left;
 740         /* IF: it doesn't fit, use 'mtu' - the data space left */
 741         if (len > state->mtu)
 742                 len = state->mtu;
 743         /* IF: we are not sending up to and including the packet end
 744            then align the next start on an eight byte boundary */
 745         if (len < state->left)
 746                 len &= ~7;
 747
 748         /* Allocate buffer */
 749         frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
 750                          state->hroom + state->troom, GFP_ATOMIC);
 751         if (!frag)
 752                 return ERR_PTR(-ENOMEM);
 753
 754         /*
 755          *      Set up data on packet
 756          */
 757
 758         ip6_copy_metadata(frag, skb);
 759         skb_reserve(frag, state->hroom);
 760         skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
 761         skb_reset_network_header(frag);
 762         fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
 763         frag->transport_header = (frag->network_header + state->hlen +
 764                                   sizeof(struct frag_hdr));
 765
 766         /*
 767          *      Charge the memory for the fragment to any owner
 768          *      it might possess
 769          */
 770         if (skb->sk)
 771                 skb_set_owner_w(frag, skb->sk);
 772
 773         /*
 774          *      Copy the packet header into the new buffer.
 775          */
 776         skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
 777
 778         fragnexthdr_offset = skb_network_header(frag);
 779         fragnexthdr_offset += prevhdr - skb_network_header(skb);
 780         *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 781
 782         /*
 783          *      Build fragment header.
 784          */
 785         fh->nexthdr = state->nexthdr;
 786         fh->reserved = 0;
 787         fh->identification = state->frag_id;
 788
 789         /*
 790          *      Copy a block of the IP datagram.
 791          */
 792         BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
 793                              len));
 794         state->left -= len;
 795
 796         fh->frag_off = htons(state->offset);
 797         if (state->left > 0)
 798                 fh->frag_off |= htons(IP6_MF);
 799         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 800
 801         state->ptr += len;
 802         state->offset += len;
 803
 804         return frag;
 805 }
 806 EXPORT_SYMBOL(ip6_frag_next);
 807
 808 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 809                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 810 {
 811         struct sk_buff *frag;
 812         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 813         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 814                                 inet6_sk(skb->sk) : NULL;
 815         struct ip6_frag_state state;
 816         unsigned int mtu, hlen, nexthdr_offset;
 817         ktime_t tstamp = skb->tstamp;
 818         int hroom, err = 0;
 819         __be32 frag_id;
 820         u8 *prevhdr, nexthdr = 0;
 821
 822         err = ip6_find_1stfragopt(skb, &prevhdr);
 823         if (err < 0)
 824                 goto fail;
 825         hlen = err;
 826         nexthdr = *prevhdr;
 827         nexthdr_offset = prevhdr - skb_network_header(skb);
 828
 829         mtu = ip6_skb_dst_mtu(skb);
 830
 831         /* We must not fragment if the socket is set to force MTU discovery
 832          * or if the skb it not generated by a local socket.
 833          */
 834         if (unlikely(!skb->ignore_df && skb->len > mtu))
 835                 goto fail_toobig;
 836
 837         if (IP6CB(skb)->frag_max_size) {
 838                 if (IP6CB(skb)->frag_max_size > mtu)
 839                         goto fail_toobig;
 840
 841                 /* don't send fragments larger than what we received */
 842                 mtu = IP6CB(skb)->frag_max_size;
 843                 if (mtu < IPV6_MIN_MTU)
 844                         mtu = IPV6_MIN_MTU;
 845         }
 846
 847         if (np && np->frag_size < mtu) {
 848                 if (np->frag_size)
 849                         mtu = np->frag_size;
 850         }
 851         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 852                 goto fail_toobig;
 853         mtu -= hlen + sizeof(struct frag_hdr);
 854
 855         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 856                                     &ipv6_hdr(skb)->saddr);
 857
 858         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 859             (err = skb_checksum_help(skb)))
 860                 goto fail;
 861
 862         prevhdr = skb_network_header(skb) + nexthdr_offset;
 863         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 864         if (skb_has_frag_list(skb)) {
 865                 unsigned int first_len = skb_pagelen(skb);
 866                 struct ip6_fraglist_iter iter;
 867                 struct sk_buff *frag2;
 868
 869                 if (first_len - hlen > mtu ||
 870                     ((first_len - hlen) & 7) ||
 871                     skb_cloned(skb) ||
 872                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 873                         goto slow_path;
 874
 875                 skb_walk_frags(skb, frag) {
 876                         /* Correct geometry. */
 877                         if (frag->len > mtu ||
 878                             ((frag->len & 7) && frag->next) ||
 879                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 880                                 goto slow_path_clean;
 881
 882                         /* Partially cloned skb? */
 883                         if (skb_shared(frag))
 884                                 goto slow_path_clean;
 885
 886                         BUG_ON(frag->sk);
 887                         if (skb->sk) {
 888                                 frag->sk = skb->sk;
 889                                 frag->destructor = sock_wfree;
 890                         }
 891                         skb->truesize -= frag->truesize;
 892                 }
 893
 894                 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
 895                                         &iter);
 896                 if (err < 0)
 897                         goto fail;
 898
 899                 for (;;) {
 900                         /* Prepare header of the next frame,
 901                          * before previous one went down. */
 902                         if (iter.frag)
 903                                 ip6_fraglist_prepare(skb, &iter);
 904
 905                         skb->tstamp = tstamp;
 906                         err = output(net, sk, skb);
 907                         if (!err)
 908                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 909                                               IPSTATS_MIB_FRAGCREATES);
 910
 911                         if (err || !iter.frag)
 912                                 break;
 913
 914                         skb = ip6_fraglist_next(&iter);
 915                 }
 916
 917                 kfree(iter.tmp_hdr);
 918
 919                 if (err == 0) {
 920                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 921                                       IPSTATS_MIB_FRAGOKS);
 922                         return 0;
 923                 }
 924
 925                 kfree_skb_list(iter.frag);
 926
 927                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 928                               IPSTATS_MIB_FRAGFAILS);
 929                 return err;
 930
 931 slow_path_clean:
 932                 skb_walk_frags(skb, frag2) {
 933                         if (frag2 == frag)
 934                                 break;
 935                         frag2->sk = NULL;
 936                         frag2->destructor = NULL;
 937                         skb->truesize += frag2->truesize;
 938                 }
 939         }
 940
 941 slow_path:
 942         /*
 943          *      Fragment the datagram.
 944          */
 945
 946         ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
 947                       LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
 948                       &state);
 949
 950         /*
 951          *      Keep copying data until we run out.
 952          */
 953
 954         while (state.left > 0) {
 955                 frag = ip6_frag_next(skb, &state);
 956                 if (IS_ERR(frag)) {
 957                         err = PTR_ERR(frag);
 958                         goto fail;
 959                 }
 960
 961                 /*
 962                  *      Put this fragment into the sending queue.
 963                  */
 964                 frag->tstamp = tstamp;
 965                 err = output(net, sk, frag);
 966                 if (err)
 967                         goto fail;
 968
 969                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 970                               IPSTATS_MIB_FRAGCREATES);
 971         }
 972         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 973                       IPSTATS_MIB_FRAGOKS);
 974         consume_skb(skb);
 975         return err;
 976
 977 fail_toobig:
 978         if (skb->sk && dst_allfrag(skb_dst(skb)))
 979                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 980
 981         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 982         err = -EMSGSIZE;
 983
 984 fail:
 985         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 986                       IPSTATS_MIB_FRAGFAILS);
 987         kfree_skb(skb);
 988         return err;
 989 }
 990
 991 static inline int ip6_rt_check(const struct rt6key *rt_key,
 992                                const struct in6_addr *fl_addr,
 993                                const struct in6_addr *addr_cache)
 994 {
 995         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 996                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
 997 }
 998
 999 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1000                                           struct dst_entry *dst,
1001                                           const struct flowi6 *fl6)
1002 {
1003         struct ipv6_pinfo *np = inet6_sk(sk);
1004         struct rt6_info *rt;
1005
1006         if (!dst)
1007                 goto out;
1008
1009         if (dst->ops->family != AF_INET6) {
1010                 dst_release(dst);
1011                 return NULL;
1012         }
1013
1014         rt = (struct rt6_info *)dst;
1015         /* Yes, checking route validity in not connected
1016          * case is not very simple. Take into account,
1017          * that we do not support routing by source, TOS,
1018          * and MSG_DONTROUTE            --ANK (980726)
1019          *
1020          * 1. ip6_rt_check(): If route was host route,
1021          *    check that cached destination is current.
1022          *    If it is network route, we still may
1023          *    check its validity using saved pointer
1024          *    to the last used address: daddr_cache.
1025          *    We do not want to save whole address now,
1026          *    (because main consumer of this service
1027          *    is tcp, which has not this problem),
1028          *    so that the last trick works only on connected
1029          *    sockets.
1030          * 2. oif also should be the same.
1031          */
1032         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1033 #ifdef CONFIG_IPV6_SUBTREES
1034             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1035 #endif
1036            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1037               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1038                 dst_release(dst);
1039                 dst = NULL;
1040         }
1041
1042 out:
1043         return dst;
1044 }
1045
1046 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1047                                struct dst_entry **dst, struct flowi6 *fl6)
1048 {
1049 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1050         struct neighbour *n;
1051         struct rt6_info *rt;
1052 #endif
1053         int err;
1054         int flags = 0;
1055
1056         /* The correct way to handle this would be to do
1057          * ip6_route_get_saddr, and then ip6_route_output; however,
1058          * the route-specific preferred source forces the
1059          * ip6_route_output call _before_ ip6_route_get_saddr.
1060          *
1061          * In source specific routing (no src=any default route),
1062          * ip6_route_output will fail given src=any saddr, though, so
1063          * that's why we try it again later.
1064          */
1065         if (ipv6_addr_any(&fl6->saddr)) {
1066                 struct fib6_info *from;
1067                 struct rt6_info *rt;
1068
1069                 *dst = ip6_route_output(net, sk, fl6);
1070                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1071
1072                 rcu_read_lock();
1073                 from = rt ? rcu_dereference(rt->from) : NULL;
1074                 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1075                                           sk ? inet6_sk(sk)->srcprefs : 0,
1076                                           &fl6->saddr);
1077                 rcu_read_unlock();
1078
1079                 if (err)
1080                         goto out_err_release;
1081
1082                 /* If we had an erroneous initial result, pretend it
1083                  * never existed and let the SA-enabled version take
1084                  * over.
1085                  */
1086                 if ((*dst)->error) {
1087                         dst_release(*dst);
1088                         *dst = NULL;
1089                 }
1090
1091                 if (fl6->flowi6_oif)
1092                         flags |= RT6_LOOKUP_F_IFACE;
1093         }
1094
1095         if (!*dst)
1096                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1097
1098         err = (*dst)->error;
1099         if (err)
1100                 goto out_err_release;
1101
1102 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1103         /*
1104          * Here if the dst entry we've looked up
1105          * has a neighbour entry that is in the INCOMPLETE
1106          * state and the src address from the flow is
1107          * marked as OPTIMISTIC, we release the found
1108          * dst entry and replace it instead with the
1109          * dst entry of the nexthop router
1110          */
1111         rt = (struct rt6_info *) *dst;
1112         rcu_read_lock_bh();
1113         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1114                                       rt6_nexthop(rt, &fl6->daddr));
1115         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1116         rcu_read_unlock_bh();
1117
1118         if (err) {
1119                 struct inet6_ifaddr *ifp;
1120                 struct flowi6 fl_gw6;
1121                 int redirect;
1122
1123                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1124                                       (*dst)->dev, 1);
1125
1126                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1127                 if (ifp)
1128                         in6_ifa_put(ifp);
1129
1130                 if (redirect) {
1131                         /*
1132                          * We need to get the dst entry for the
1133                          * default router instead
1134                          */
1135                         dst_release(*dst);
1136                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1137                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1138                         *dst = ip6_route_output(net, sk, &fl_gw6);
1139                         err = (*dst)->error;
1140                         if (err)
1141                                 goto out_err_release;
1142                 }
1143         }
1144 #endif
1145         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1146             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1147                 err = -EAFNOSUPPORT;
1148                 goto out_err_release;
1149         }
1150
1151         return 0;
1152
1153 out_err_release:
1154         dst_release(*dst);
1155         *dst = NULL;
1156
1157         if (err == -ENETUNREACH)
1158                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1159         return err;
1160 }
1161
1162 /**
1163  *      ip6_dst_lookup - perform route lookup on flow
1164  *      @net: Network namespace to perform lookup in
1165  *      @sk: socket which provides route info
1166  *      @dst: pointer to dst_entry * for result
1167  *      @fl6: flow to lookup
1168  *
1169  *      This function performs a route lookup on the given flow.
1170  *
1171  *      It returns zero on success, or a standard errno code on error.
1172  */
1173 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1174                    struct flowi6 *fl6)
1175 {
1176         *dst = NULL;
1177         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1178 }
1179 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1180
1181 /**
1182  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1183  *      @net: Network namespace to perform lookup in
1184  *      @sk: socket which provides route info
1185  *      @fl6: flow to lookup
1186  *      @final_dst: final destination address for ipsec lookup
1187  *
1188  *      This function performs a route lookup on the given flow.
1189  *
1190  *      It returns a valid dst pointer on success, or a pointer encoded
1191  *      error code.
1192  */
1193 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1194                                       const struct in6_addr *final_dst)
1195 {
1196         struct dst_entry *dst = NULL;
1197         int err;
1198
1199         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1200         if (err)
1201                 return ERR_PTR(err);
1202         if (final_dst)
1203                 fl6->daddr = *final_dst;
1204
1205         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1206 }
1207 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1208
1209 /**
1210  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1211  *      @sk: socket which provides the dst cache and route info
1212  *      @fl6: flow to lookup
1213  *      @final_dst: final destination address for ipsec lookup
1214  *      @connected: whether @sk is connected or not
1215  *
1216  *      This function performs a route lookup on the given flow with the
1217  *      possibility of using the cached route in the socket if it is valid.
1218  *      It will take the socket dst lock when operating on the dst cache.
1219  *      As a result, this function can only be used in process context.
1220  *
1221  *      In addition, for a connected socket, cache the dst in the socket
1222  *      if the current cache is not valid.
1223  *
1224  *      It returns a valid dst pointer on success, or a pointer encoded
1225  *      error code.
1226  */
1227 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1228                                          const struct in6_addr *final_dst,
1229                                          bool connected)
1230 {
1231         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1232
1233         dst = ip6_sk_dst_check(sk, dst, fl6);
1234         if (dst)
1235                 return dst;
1236
1237         dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1238         if (connected && !IS_ERR(dst))
1239                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1240
1241         return dst;
1242 }
1243 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1244
1245 /**
1246  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1247  *      @skb: Packet for which lookup is done
1248  *      @dev: Tunnel device
1249  *      @net: Network namespace of tunnel device
1250  *      @sock: Socket which provides route info
1251  *      @saddr: Memory to store the src ip address
1252  *      @info: Tunnel information
1253  *      @protocol: IP protocol
1254  *      @use_cache: Flag to enable cache usage
1255  *      This function performs a route lookup on a tunnel
1256  *
1257  *      It returns a valid dst pointer and stores src address to be used in
1258  *      tunnel in param saddr on success, else a pointer encoded error code.
1259  */
1260
1261 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1262                                         struct net_device *dev,
1263                                         struct net *net,
1264                                         struct socket *sock,
1265                                         struct in6_addr *saddr,
1266                                         const struct ip_tunnel_info *info,
1267                                         u8 protocol,
1268                                         bool use_cache)
1269 {
1270         struct dst_entry *dst = NULL;
1271 #ifdef CONFIG_DST_CACHE
1272         struct dst_cache *dst_cache;
1273 #endif
1274         struct flowi6 fl6;
1275         __u8 prio;
1276
1277 #ifdef CONFIG_DST_CACHE
1278         dst_cache = (struct dst_cache *)&info->dst_cache;
1279         if (use_cache) {
1280                 dst = dst_cache_get_ip6(dst_cache, saddr);
1281                 if (dst)
1282                         return dst;
1283         }
1284 #endif
1285         memset(&fl6, 0, sizeof(fl6));
1286         fl6.flowi6_mark = skb->mark;
1287         fl6.flowi6_proto = protocol;
1288         fl6.daddr = info->key.u.ipv6.dst;
1289         fl6.saddr = info->key.u.ipv6.src;
1290         prio = info->key.tos;
1291         fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
1292                                           info->key.label);
1293
1294         dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1295                                               NULL);
1296         if (IS_ERR(dst)) {
1297                 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1298                 return ERR_PTR(-ENETUNREACH);
1299         }
1300         if (dst->dev == dev) { /* is this necessary? */
1301                 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1302                 dst_release(dst);
1303                 return ERR_PTR(-ELOOP);
1304         }
1305 #ifdef CONFIG_DST_CACHE
1306         if (use_cache)
1307                 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1308 #endif
1309         *saddr = fl6.saddr;
1310         return dst;
1311 }
1312 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1313
1314 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1315                                                gfp_t gfp)
1316 {
1317         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1318 }
1319
1320 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1321                                                 gfp_t gfp)
1322 {
1323         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1324 }
1325
1326 static void ip6_append_data_mtu(unsigned int *mtu,
1327                                 int *maxfraglen,
1328                                 unsigned int fragheaderlen,
1329                                 struct sk_buff *skb,
1330                                 struct rt6_info *rt,
1331                                 unsigned int orig_mtu)
1332 {
1333         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1334                 if (!skb) {
1335                         /* first fragment, reserve header_len */
1336                         *mtu = orig_mtu - rt->dst.header_len;
1337
1338                 } else {
1339                         /*
1340                          * this fragment is not first, the headers
1341                          * space is regarded as data space.
1342                          */
1343                         *mtu = orig_mtu;
1344                 }
1345                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1346                               + fragheaderlen - sizeof(struct frag_hdr);
1347         }
1348 }
1349
1350 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1351                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1352                           struct rt6_info *rt, struct flowi6 *fl6)
1353 {
1354         struct ipv6_pinfo *np = inet6_sk(sk);
1355         unsigned int mtu;
1356         struct ipv6_txoptions *opt = ipc6->opt;
1357
1358         /*
1359          * setup for corking
1360          */
1361         if (opt) {
1362                 if (WARN_ON(v6_cork->opt))
1363                         return -EINVAL;
1364
1365                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1366                 if (unlikely(!v6_cork->opt))
1367                         return -ENOBUFS;
1368
1369                 v6_cork->opt->tot_len = sizeof(*opt);
1370                 v6_cork->opt->opt_flen = opt->opt_flen;
1371                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1372
1373                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1374                                                     sk->sk_allocation);
1375                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1376                         return -ENOBUFS;
1377
1378                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1379                                                     sk->sk_allocation);
1380                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1381                         return -ENOBUFS;
1382
1383                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1384                                                    sk->sk_allocation);
1385                 if (opt->hopopt && !v6_cork->opt->hopopt)
1386                         return -ENOBUFS;
1387
1388                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1389                                                     sk->sk_allocation);
1390                 if (opt->srcrt && !v6_cork->opt->srcrt)
1391                         return -ENOBUFS;
1392
1393                 /* need source address above miyazawa*/
1394         }
1395         dst_hold(&rt->dst);
1396         cork->base.dst = &rt->dst;
1397         cork->fl.u.ip6 = *fl6;
1398         v6_cork->hop_limit = ipc6->hlimit;
1399         v6_cork->tclass = ipc6->tclass;
1400         if (rt->dst.flags & DST_XFRM_TUNNEL)
1401                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1402                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1403         else
1404                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1405                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1406         if (np->frag_size < mtu) {
1407                 if (np->frag_size)
1408                         mtu = np->frag_size;
1409         }
1410         if (mtu < IPV6_MIN_MTU)
1411                 return -EINVAL;
1412         cork->base.fragsize = mtu;
1413         cork->base.gso_size = ipc6->gso_size;
1414         cork->base.tx_flags = 0;
1415         cork->base.mark = ipc6->sockc.mark;
1416         sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1417
1418         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1419                 cork->base.flags |= IPCORK_ALLFRAG;
1420         cork->base.length = 0;
1421
1422         cork->base.transmit_time = ipc6->sockc.transmit_time;
1423
1424         return 0;
1425 }
1426
1427 static int __ip6_append_data(struct sock *sk,
1428                              struct flowi6 *fl6,
1429                              struct sk_buff_head *queue,
1430                              struct inet_cork *cork,
1431                              struct inet6_cork *v6_cork,
1432                              struct page_frag *pfrag,
1433                              int getfrag(void *from, char *to, int offset,
1434                                          int len, int odd, struct sk_buff *skb),
1435                              void *from, int length, int transhdrlen,
1436                              unsigned int flags, struct ipcm6_cookie *ipc6)
1437 {
1438         struct sk_buff *skb, *skb_prev = NULL;
1439         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1440         struct ubuf_info *uarg = NULL;
1441         int exthdrlen = 0;
1442         int dst_exthdrlen = 0;
1443         int hh_len;
1444         int copy;
1445         int err;
1446         int offset = 0;
1447         u32 tskey = 0;
1448         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1449         struct ipv6_txoptions *opt = v6_cork->opt;
1450         int csummode = CHECKSUM_NONE;
1451         unsigned int maxnonfragsize, headersize;
1452         unsigned int wmem_alloc_delta = 0;
1453         bool paged, extra_uref = false;
1454
1455         skb = skb_peek_tail(queue);
1456         if (!skb) {
1457                 exthdrlen = opt ? opt->opt_flen : 0;
1458                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1459         }
1460
1461         paged = !!cork->gso_size;
1462         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1463         orig_mtu = mtu;
1464
1465         if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1466             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1467                 tskey = sk->sk_tskey++;
1468
1469         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1470
1471         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1472                         (opt ? opt->opt_nflen : 0);
1473         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1474                      sizeof(struct frag_hdr);
1475
1476         headersize = sizeof(struct ipv6hdr) +
1477                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1478                      (dst_allfrag(&rt->dst) ?
1479                       sizeof(struct frag_hdr) : 0) +
1480                      rt->rt6i_nfheader_len;
1481
1482         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1483          * the first fragment
1484          */
1485         if (headersize + transhdrlen > mtu)
1486                 goto emsgsize;
1487
1488         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1489             (sk->sk_protocol == IPPROTO_UDP ||
1490              sk->sk_protocol == IPPROTO_RAW)) {
1491                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1492                                 sizeof(struct ipv6hdr));
1493                 goto emsgsize;
1494         }
1495
1496         if (ip6_sk_ignore_df(sk))
1497                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1498         else
1499                 maxnonfragsize = mtu;
1500
1501         if (cork->length + length > maxnonfragsize - headersize) {
1502 emsgsize:
1503                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1504                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1505                 return -EMSGSIZE;
1506         }
1507
1508         /* CHECKSUM_PARTIAL only with no extension headers and when
1509          * we are not going to fragment
1510          */
1511         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1512             headersize == sizeof(struct ipv6hdr) &&
1513             length <= mtu - headersize &&
1514             (!(flags & MSG_MORE) || cork->gso_size) &&
1515             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1516                 csummode = CHECKSUM_PARTIAL;
1517
1518         if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1519                 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1520                 if (!uarg)
1521                         return -ENOBUFS;
1522                 extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1523                 if (rt->dst.dev->features & NETIF_F_SG &&
1524                     csummode == CHECKSUM_PARTIAL) {
1525                         paged = true;
1526                 } else {
1527                         uarg->zerocopy = 0;
1528                         skb_zcopy_set(skb, uarg, &extra_uref);
1529                 }
1530         }
1531
1532         /*
1533          * Let's try using as much space as possible.
1534          * Use MTU if total length of the message fits into the MTU.
1535          * Otherwise, we need to reserve fragment header and
1536          * fragment alignment (= 8-15 octects, in total).
1537          *
1538          * Note that we may need to "move" the data from the tail
1539          * of the buffer to the new fragment when we split
1540          * the message.
1541          *
1542          * FIXME: It may be fragmented into multiple chunks
1543          *        at once if non-fragmentable extension headers
1544          *        are too large.
1545          * --yoshfuji
1546          */
1547
1548         cork->length += length;
1549         if (!skb)
1550                 goto alloc_new_skb;
1551
1552         while (length > 0) {
1553                 /* Check if the remaining data fits into current packet. */
1554                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1555                 if (copy < length)
1556                         copy = maxfraglen - skb->len;
1557
1558                 if (copy <= 0) {
1559                         char *data;
1560                         unsigned int datalen;
1561                         unsigned int fraglen;
1562                         unsigned int fraggap;
1563                         unsigned int alloclen, alloc_extra;
1564                         unsigned int pagedlen;
1565 alloc_new_skb:
1566                         /* There's no room in the current skb */
1567                         if (skb)
1568                                 fraggap = skb->len - maxfraglen;
1569                         else
1570                                 fraggap = 0;
1571                         /* update mtu and maxfraglen if necessary */
1572                         if (!skb || !skb_prev)
1573                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1574                                                     fragheaderlen, skb, rt,
1575                                                     orig_mtu);
1576
1577                         skb_prev = skb;
1578
1579                         /*
1580                          * If remaining data exceeds the mtu,
1581                          * we know we need more fragment(s).
1582                          */
1583                         datalen = length + fraggap;
1584
1585                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1586                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1587                         fraglen = datalen + fragheaderlen;
1588                         pagedlen = 0;
1589
1590                         alloc_extra = hh_len;
1591                         alloc_extra += dst_exthdrlen;
1592                         alloc_extra += rt->dst.trailer_len;
1593
1594                         /* We just reserve space for fragment header.
1595                          * Note: this may be overallocation if the message
1596                          * (without MSG_MORE) fits into the MTU.
1597                          */
1598                         alloc_extra += sizeof(struct frag_hdr);
1599
1600                         if ((flags & MSG_MORE) &&
1601                             !(rt->dst.dev->features&NETIF_F_SG))
1602                                 alloclen = mtu;
1603                         else if (!paged &&
1604                                  (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1605                                   !(rt->dst.dev->features & NETIF_F_SG)))
1606                                 alloclen = fraglen;
1607                         else {
1608                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1609                                 pagedlen = fraglen - alloclen;
1610                         }
1611                         alloclen += alloc_extra;
1612
1613                         if (datalen != length + fraggap) {
1614                                 /*
1615                                  * this is not the last fragment, the trailer
1616                                  * space is regarded as data space.
1617                                  */
1618                                 datalen += rt->dst.trailer_len;
1619                         }
1620
1621                         fraglen = datalen + fragheaderlen;
1622
1623                         copy = datalen - transhdrlen - fraggap - pagedlen;
1624                         if (copy < 0) {
1625                                 err = -EINVAL;
1626                                 goto error;
1627                         }
1628                         if (transhdrlen) {
1629                                 skb = sock_alloc_send_skb(sk, alloclen,
1630                                                 (flags & MSG_DONTWAIT), &err);
1631                         } else {
1632                                 skb = NULL;
1633                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1634                                     2 * sk->sk_sndbuf)
1635                                         skb = alloc_skb(alloclen,
1636                                                         sk->sk_allocation);
1637                                 if (unlikely(!skb))
1638                                         err = -ENOBUFS;
1639                         }
1640                         if (!skb)
1641                                 goto error;
1642                         /*
1643                          *      Fill in the control structures
1644                          */
1645                         skb->protocol = htons(ETH_P_IPV6);
1646                         skb->ip_summed = csummode;
1647                         skb->csum = 0;
1648                         /* reserve for fragmentation and ipsec header */
1649                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1650                                     dst_exthdrlen);
1651
1652                         /*
1653                          *      Find where to start putting bytes
1654                          */
1655                         data = skb_put(skb, fraglen - pagedlen);
1656                         skb_set_network_header(skb, exthdrlen);
1657                         data += fragheaderlen;
1658                         skb->transport_header = (skb->network_header +
1659                                                  fragheaderlen);
1660                         if (fraggap) {
1661                                 skb->csum = skb_copy_and_csum_bits(
1662                                         skb_prev, maxfraglen,
1663                                         data + transhdrlen, fraggap);
1664                                 skb_prev->csum = csum_sub(skb_prev->csum,
1665                                                           skb->csum);
1666                                 data += fraggap;
1667                                 pskb_trim_unique(skb_prev, maxfraglen);
1668                         }
1669                         if (copy > 0 &&
1670                             getfrag(from, data + transhdrlen, offset,
1671                                     copy, fraggap, skb) < 0) {
1672                                 err = -EFAULT;
1673                                 kfree_skb(skb);
1674                                 goto error;
1675                         }
1676
1677                         offset += copy;
1678                         length -= copy + transhdrlen;
1679                         transhdrlen = 0;
1680                         exthdrlen = 0;
1681                         dst_exthdrlen = 0;
1682
1683                         /* Only the initial fragment is time stamped */
1684                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1685                         cork->tx_flags = 0;
1686                         skb_shinfo(skb)->tskey = tskey;
1687                         tskey = 0;
1688                         skb_zcopy_set(skb, uarg, &extra_uref);
1689
1690                         if ((flags & MSG_CONFIRM) && !skb_prev)
1691                                 skb_set_dst_pending_confirm(skb, 1);
1692
1693                         /*
1694                          * Put the packet on the pending queue
1695                          */
1696                         if (!skb->destructor) {
1697                                 skb->destructor = sock_wfree;
1698                                 skb->sk = sk;
1699                                 wmem_alloc_delta += skb->truesize;
1700                         }
1701                         __skb_queue_tail(queue, skb);
1702                         continue;
1703                 }
1704
1705                 if (copy > length)
1706                         copy = length;
1707
1708                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1709                     skb_tailroom(skb) >= copy) {
1710                         unsigned int off;
1711
1712                         off = skb->len;
1713                         if (getfrag(from, skb_put(skb, copy),
1714                                                 offset, copy, off, skb) < 0) {
1715                                 __skb_trim(skb, off);
1716                                 err = -EFAULT;
1717                                 goto error;
1718                         }
1719                 } else if (!uarg || !uarg->zerocopy) {
1720                         int i = skb_shinfo(skb)->nr_frags;
1721
1722                         err = -ENOMEM;
1723                         if (!sk_page_frag_refill(sk, pfrag))
1724                                 goto error;
1725
1726                         if (!skb_can_coalesce(skb, i, pfrag->page,
1727                                               pfrag->offset)) {
1728                                 err = -EMSGSIZE;
1729                                 if (i == MAX_SKB_FRAGS)
1730                                         goto error;
1731
1732                                 __skb_fill_page_desc(skb, i, pfrag->page,
1733                                                      pfrag->offset, 0);
1734                                 skb_shinfo(skb)->nr_frags = ++i;
1735                                 get_page(pfrag->page);
1736                         }
1737                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1738                         if (getfrag(from,
1739                                     page_address(pfrag->page) + pfrag->offset,
1740                                     offset, copy, skb->len, skb) < 0)
1741                                 goto error_efault;
1742
1743                         pfrag->offset += copy;
1744                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1745                         skb->len += copy;
1746                         skb->data_len += copy;
1747                         skb->truesize += copy;
1748                         wmem_alloc_delta += copy;
1749                 } else {
1750                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1751                         if (err < 0)
1752                                 goto error;
1753                 }
1754                 offset += copy;
1755                 length -= copy;
1756         }
1757
1758         if (wmem_alloc_delta)
1759                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1760         return 0;
1761
1762 error_efault:
1763         err = -EFAULT;
1764 error:
1765         net_zcopy_put_abort(uarg, extra_uref);
1766         cork->length -= length;
1767         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1768         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1769         return err;
1770 }
1771
1772 int ip6_append_data(struct sock *sk,
1773                     int getfrag(void *from, char *to, int offset, int len,
1774                                 int odd, struct sk_buff *skb),
1775                     void *from, int length, int transhdrlen,
1776                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1777                     struct rt6_info *rt, unsigned int flags)
1778 {
1779         struct inet_sock *inet = inet_sk(sk);
1780         struct ipv6_pinfo *np = inet6_sk(sk);
1781         int exthdrlen;
1782         int err;
1783
1784         if (flags&MSG_PROBE)
1785                 return 0;
1786         if (skb_queue_empty(&sk->sk_write_queue)) {
1787                 /*
1788                  * setup for corking
1789                  */
1790                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1791                                      ipc6, rt, fl6);
1792                 if (err)
1793                         return err;
1794
1795                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1796                 length += exthdrlen;
1797                 transhdrlen += exthdrlen;
1798         } else {
1799                 fl6 = &inet->cork.fl.u.ip6;
1800                 transhdrlen = 0;
1801         }
1802
1803         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1804                                  &np->cork, sk_page_frag(sk), getfrag,
1805                                  from, length, transhdrlen, flags, ipc6);
1806 }
1807 EXPORT_SYMBOL_GPL(ip6_append_data);
1808
1809 static void ip6_cork_release(struct inet_cork_full *cork,
1810                              struct inet6_cork *v6_cork)
1811 {
1812         if (v6_cork->opt) {
1813                 kfree(v6_cork->opt->dst0opt);
1814                 kfree(v6_cork->opt->dst1opt);
1815                 kfree(v6_cork->opt->hopopt);
1816                 kfree(v6_cork->opt->srcrt);
1817                 kfree(v6_cork->opt);
1818                 v6_cork->opt = NULL;
1819         }
1820
1821         if (cork->base.dst) {
1822                 dst_release(cork->base.dst);
1823                 cork->base.dst = NULL;
1824                 cork->base.flags &= ~IPCORK_ALLFRAG;
1825         }
1826         memset(&cork->fl, 0, sizeof(cork->fl));
1827 }
1828
1829 struct sk_buff *__ip6_make_skb(struct sock *sk,
1830                                struct sk_buff_head *queue,
1831                                struct inet_cork_full *cork,
1832                                struct inet6_cork *v6_cork)
1833 {
1834         struct sk_buff *skb, *tmp_skb;
1835         struct sk_buff **tail_skb;
1836         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1837         struct ipv6_pinfo *np = inet6_sk(sk);
1838         struct net *net = sock_net(sk);
1839         struct ipv6hdr *hdr;
1840         struct ipv6_txoptions *opt = v6_cork->opt;
1841         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1842         struct flowi6 *fl6 = &cork->fl.u.ip6;
1843         unsigned char proto = fl6->flowi6_proto;
1844
1845         skb = __skb_dequeue(queue);
1846         if (!skb)
1847                 goto out;
1848         tail_skb = &(skb_shinfo(skb)->frag_list);
1849
1850         /* move skb->data to ip header from ext header */
1851         if (skb->data < skb_network_header(skb))
1852                 __skb_pull(skb, skb_network_offset(skb));
1853         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1854                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1855                 *tail_skb = tmp_skb;
1856                 tail_skb = &(tmp_skb->next);
1857                 skb->len += tmp_skb->len;
1858                 skb->data_len += tmp_skb->len;
1859                 skb->truesize += tmp_skb->truesize;
1860                 tmp_skb->destructor = NULL;
1861                 tmp_skb->sk = NULL;
1862         }
1863
1864         /* Allow local fragmentation. */
1865         skb->ignore_df = ip6_sk_ignore_df(sk);
1866
1867         *final_dst = fl6->daddr;
1868         __skb_pull(skb, skb_network_header_len(skb));
1869         if (opt && opt->opt_flen)
1870                 ipv6_push_frag_opts(skb, opt, &proto);
1871         if (opt && opt->opt_nflen)
1872                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1873
1874         skb_push(skb, sizeof(struct ipv6hdr));
1875         skb_reset_network_header(skb);
1876         hdr = ipv6_hdr(skb);
1877
1878         ip6_flow_hdr(hdr, v6_cork->tclass,
1879                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1880                                         ip6_autoflowlabel(net, np), fl6));
1881         hdr->hop_limit = v6_cork->hop_limit;
1882         hdr->nexthdr = proto;
1883         hdr->saddr = fl6->saddr;
1884         hdr->daddr = *final_dst;
1885
1886         skb->priority = sk->sk_priority;
1887         skb->mark = cork->base.mark;
1888
1889         skb->tstamp = cork->base.transmit_time;
1890
1891         skb_dst_set(skb, dst_clone(&rt->dst));
1892         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1893         if (proto == IPPROTO_ICMPV6) {
1894                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1895
1896                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1897                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1898         }
1899
1900         ip6_cork_release(cork, v6_cork);
1901 out:
1902         return skb;
1903 }
1904
1905 int ip6_send_skb(struct sk_buff *skb)
1906 {
1907         struct net *net = sock_net(skb->sk);
1908         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1909         int err;
1910
1911         err = ip6_local_out(net, skb->sk, skb);
1912         if (err) {
1913                 if (err > 0)
1914                         err = net_xmit_errno(err);
1915                 if (err)
1916                         IP6_INC_STATS(net, rt->rt6i_idev,
1917                                       IPSTATS_MIB_OUTDISCARDS);
1918         }
1919
1920         return err;
1921 }
1922
/*
 *	Finalize whatever is pending on the socket into a single skb and
 *	transmit it.  Returns 0 when nothing was pending, otherwise the
 *	result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1934
1935 static void __ip6_flush_pending_frames(struct sock *sk,
1936                                        struct sk_buff_head *queue,
1937                                        struct inet_cork_full *cork,
1938                                        struct inet6_cork *v6_cork)
1939 {
1940         struct sk_buff *skb;
1941
1942         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1943                 if (skb_dst(skb))
1944                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1945                                       IPSTATS_MIB_OUTDISCARDS);
1946                 kfree_skb(skb);
1947         }
1948
1949         ip6_cork_release(cork, v6_cork);
1950 }
1951
1952 void ip6_flush_pending_frames(struct sock *sk)
1953 {
1954         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1955                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1956 }
1957 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1958
1959 struct sk_buff *ip6_make_skb(struct sock *sk,
1960                              int getfrag(void *from, char *to, int offset,
1961                                          int len, int odd, struct sk_buff *skb),
1962                              void *from, int length, int transhdrlen,
1963                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1964                              struct rt6_info *rt, unsigned int flags,
1965                              struct inet_cork_full *cork)
1966 {
1967         struct inet6_cork v6_cork;
1968         struct sk_buff_head queue;
1969         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1970         int err;
1971
1972         if (flags & MSG_PROBE)
1973                 return NULL;
1974
1975         __skb_queue_head_init(&queue);
1976
1977         cork->base.flags = 0;
1978         cork->base.addr = 0;
1979         cork->base.opt = NULL;
1980         cork->base.dst = NULL;
1981         v6_cork.opt = NULL;
1982         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1983         if (err) {
1984                 ip6_cork_release(cork, &v6_cork);
1985                 return ERR_PTR(err);
1986         }
1987         if (ipc6->dontfrag < 0)
1988                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1989
1990         err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1991                                 &current->task_frag, getfrag, from,
1992                                 length + exthdrlen, transhdrlen + exthdrlen,
1993                                 flags, ipc6);
1994         if (err) {
1995                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1996                 return ERR_PTR(err);
1997         }
1998
1999         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2000 }