net/ipv6/ip6_output.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *      IPv6 output functions
   4  *      Linux INET6 implementation
   5  *
   6  *      Authors:
   7  *      Pedro Roque             <roque@di.fc.ul.pt>
   8  *
   9  *      Based on linux/net/ipv4/ip_output.c
  10  *
  11  *      Changes:
  12  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
  13  *                              extension headers are implemented.
  14  *                              route changes now work.
  15  *                              ip6_forward does not confuse sniffers.
  16  *                              etc.
  17  *
  18  *      H. von Brand    :       Added missing #include <linux/string.h>
  19  *      Imran Patel     :       frag id should be in NBO
  20  *      Kazunori MIYAZAWA @USAGI
  21  *                      :       add ip6_append_data and related functions
  22  *                              for datagram xmit
  23  */
  24
  25 #include <linux/errno.h>
  26 #include <linux/kernel.h>
  27 #include <linux/string.h>
  28 #include <linux/socket.h>
  29 #include <linux/net.h>
  30 #include <linux/netdevice.h>
  31 #include <linux/if_arp.h>
  32 #include <linux/in6.h>
  33 #include <linux/tcp.h>
  34 #include <linux/route.h>
  35 #include <linux/module.h>
  36 #include <linux/slab.h>
  37
  38 #include <linux/bpf-cgroup.h>
  39 #include <linux/netfilter.h>
  40 #include <linux/netfilter_ipv6.h>
  41
  42 #include <net/sock.h>
  43 #include <net/snmp.h>
  44
  45 #include <net/ipv6.h>
  46 #include <net/ndisc.h>
  47 #include <net/protocol.h>
  48 #include <net/ip6_route.h>
  49 #include <net/addrconf.h>
  50 #include <net/rawv6.h>
  51 #include <net/icmp.h>
  52 #include <net/xfrm.h>
  53 #include <net/checksum.h>
  54 #include <linux/mroute6.h>
  55 #include <net/l3mdev.h>
  56 #include <net/lwtunnel.h>
  57 #include <net/ip_tunnels.h>
  58
  59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  60 {
  61         struct dst_entry *dst = skb_dst(skb);
  62         struct net_device *dev = dst->dev;
  63         struct inet6_dev *idev = ip6_dst_idev(dst);
  64         unsigned int hh_len = LL_RESERVED_SPACE(dev);
  65         const struct in6_addr *daddr, *nexthop;
  66         struct ipv6hdr *hdr;
  67         struct neighbour *neigh;
  68         int ret;
  69
  70         /* Be paranoid, rather than too clever. */
  71         if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
  72                 skb = skb_expand_head(skb, hh_len);
  73                 if (!skb) {
  74                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
  75                         return -ENOMEM;
  76                 }
  77         }
  78
  79         hdr = ipv6_hdr(skb);
  80         daddr = &hdr->daddr;
  81         if (ipv6_addr_is_multicast(daddr)) {
  82                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  83                     ((mroute6_is_socket(net, skb) &&
  84                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  85                      ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
  86                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  87
  88                         /* Do not check for IFF_ALLMULTI; multicast routing
  89                            is not supported in any case.
  90                          */
  91                         if (newskb)
  92                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  93                                         net, sk, newskb, NULL, newskb->dev,
  94                                         dev_loopback_xmit);
  95
  96                         if (hdr->hop_limit == 0) {
  97                                 IP6_INC_STATS(net, idev,
  98                                               IPSTATS_MIB_OUTDISCARDS);
  99                                 kfree_skb(skb);
 100                                 return 0;
 101                         }
 102                 }
 103
 104                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
 105                 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
 106                     !(dev->flags & IFF_LOOPBACK)) {
 107                         kfree_skb(skb);
 108                         return 0;
 109                 }
 110         }
 111
 112         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 113                 int res = lwtunnel_xmit(skb);
 114
 115                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 116                         return res;
 117         }
 118
 119         rcu_read_lock_bh();
 120         nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
 121         neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
 122         if (unlikely(!neigh))
 123                 neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
 124         if (!IS_ERR(neigh)) {
 125                 sock_confirm_neigh(skb, neigh);
 126                 ret = neigh_output(neigh, skb, false);
 127                 rcu_read_unlock_bh();
 128                 return ret;
 129         }
 130         rcu_read_unlock_bh();
 131
 132         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
 133         kfree_skb(skb);
 134         return -EINVAL;
 135 }
 136
 137 static int
 138 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
 139                                     struct sk_buff *skb, unsigned int mtu)
 140 {
 141         struct sk_buff *segs, *nskb;
 142         netdev_features_t features;
 143         int ret = 0;
 144
 145         /* Please see corresponding comment in ip_finish_output_gso
 146          * describing the cases where GSO segment length exceeds the
 147          * egress MTU.
 148          */
 149         features = netif_skb_features(skb);
 150         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 151         if (IS_ERR_OR_NULL(segs)) {
 152                 kfree_skb(skb);
 153                 return -ENOMEM;
 154         }
 155
 156         consume_skb(skb);
 157
 158         skb_list_walk_safe(segs, segs, nskb) {
 159                 int err;
 160
 161                 skb_mark_not_on_list(segs);
 162                 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
 163                 if (err && ret == 0)
 164                         ret = err;
 165         }
 166
 167         return ret;
 168 }
 169
 170 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 171 {
 172         unsigned int mtu;
 173
 174 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 175         /* Policy lookup after SNAT yielded a new policy */
 176         if (skb_dst(skb)->xfrm) {
 177                 IP6CB(skb)->flags |= IP6SKB_REROUTED;
 178                 return dst_output(net, sk, skb);
 179         }
 180 #endif
 181
 182         mtu = ip6_skb_dst_mtu(skb);
 183         if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
 184                 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
 185
 186         if ((skb->len > mtu && !skb_is_gso(skb)) ||
 187             dst_allfrag(skb_dst(skb)) ||
 188             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 189                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 190         else
 191                 return ip6_finish_output2(net, sk, skb);
 192 }
 193
 194 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 195 {
 196         int ret;
 197
 198         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 199         switch (ret) {
 200         case NET_XMIT_SUCCESS:
 201                 return __ip6_finish_output(net, sk, skb);
 202         case NET_XMIT_CN:
 203                 return __ip6_finish_output(net, sk, skb) ? : ret;
 204         default:
 205                 kfree_skb(skb);
 206                 return ret;
 207         }
 208 }
 209
 210 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 211 {
 212         struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
 213         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 214
 215         skb->protocol = htons(ETH_P_IPV6);
 216         skb->dev = dev;
 217
 218         if (unlikely(idev->cnf.disable_ipv6)) {
 219                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 220                 kfree_skb(skb);
 221                 return 0;
 222         }
 223
 224         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 225                             net, sk, skb, indev, dev,
 226                             ip6_finish_output,
 227                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 228 }
 229 EXPORT_SYMBOL(ip6_output);
 230
 231 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 232 {
 233         if (!np->autoflowlabel_set)
 234                 return ip6_default_np_autolabel(net);
 235         else
 236                 return np->autoflowlabel;
 237 }
 238
 239 /*
 240  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 241  * Note : socket lock is not held for SYNACK packets, but might be modified
 242  * by calls to skb_set_owner_w() and ipv6_local_error(),
 243  * which are using proper atomic operations or spinlocks.
 244  */
 245 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 246              __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
 247 {
 248         struct net *net = sock_net(sk);
 249         const struct ipv6_pinfo *np = inet6_sk(sk);
 250         struct in6_addr *first_hop = &fl6->daddr;
 251         struct dst_entry *dst = skb_dst(skb);
 252         struct net_device *dev = dst->dev;
 253         struct inet6_dev *idev = ip6_dst_idev(dst);
 254         unsigned int head_room;
 255         struct ipv6hdr *hdr;
 256         u8  proto = fl6->flowi6_proto;
 257         int seg_len = skb->len;
 258         int hlimit = -1;
 259         u32 mtu;
 260
 261         head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
 262         if (opt)
 263                 head_room += opt->opt_nflen + opt->opt_flen;
 264
 265         if (unlikely(head_room > skb_headroom(skb))) {
 266                 skb = skb_expand_head(skb, head_room);
 267                 if (!skb) {
 268                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 269                         return -ENOBUFS;
 270                 }
 271         }
 272
 273         if (opt) {
 274                 seg_len += opt->opt_nflen + opt->opt_flen;
 275
 276                 if (opt->opt_flen)
 277                         ipv6_push_frag_opts(skb, opt, &proto);
 278
 279                 if (opt->opt_nflen)
 280                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 281                                              &fl6->saddr);
 282         }
 283
 284         skb_push(skb, sizeof(struct ipv6hdr));
 285         skb_reset_network_header(skb);
 286         hdr = ipv6_hdr(skb);
 287
 288         /*
 289          *      Fill in the IPv6 header
 290          */
 291         if (np)
 292                 hlimit = np->hop_limit;
 293         if (hlimit < 0)
 294                 hlimit = ip6_dst_hoplimit(dst);
 295
 296         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 297                                 ip6_autoflowlabel(net, np), fl6));
 298
 299         hdr->payload_len = htons(seg_len);
 300         hdr->nexthdr = proto;
 301         hdr->hop_limit = hlimit;
 302
 303         hdr->saddr = fl6->saddr;
 304         hdr->daddr = *first_hop;
 305
 306         skb->protocol = htons(ETH_P_IPV6);
 307         skb->priority = priority;
 308         skb->mark = mark;
 309
 310         mtu = dst_mtu(dst);
 311         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 312                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
 313
 314                 /* if egress device is enslaved to an L3 master device pass the
 315                  * skb to its handler for processing
 316                  */
 317                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 318                 if (unlikely(!skb))
 319                         return 0;
 320
 321                 /* hooks should never assume socket lock is held.
 322                  * we promote our socket to non const
 323                  */
 324                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 325                                net, (struct sock *)sk, skb, NULL, dev,
 326                                dst_output);
 327         }
 328
 329         skb->dev = dev;
 330         /* ipv6_local_error() does not require socket lock,
 331          * we promote our socket to non const
 332          */
 333         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 334
 335         IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 336         kfree_skb(skb);
 337         return -EMSGSIZE;
 338 }
 339 EXPORT_SYMBOL(ip6_xmit);
 340
 341 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 342 {
 343         struct ip6_ra_chain *ra;
 344         struct sock *last = NULL;
 345
 346         read_lock(&ip6_ra_lock);
 347         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 348                 struct sock *sk = ra->sk;
 349                 if (sk && ra->sel == sel &&
 350                     (!sk->sk_bound_dev_if ||
 351                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 352                         struct ipv6_pinfo *np = inet6_sk(sk);
 353
 354                         if (np && np->rtalert_isolate &&
 355                             !net_eq(sock_net(sk), dev_net(skb->dev))) {
 356                                 continue;
 357                         }
 358                         if (last) {
 359                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 360                                 if (skb2)
 361                                         rawv6_rcv(last, skb2);
 362                         }
 363                         last = sk;
 364                 }
 365         }
 366
 367         if (last) {
 368                 rawv6_rcv(last, skb);
 369                 read_unlock(&ip6_ra_lock);
 370                 return 1;
 371         }
 372         read_unlock(&ip6_ra_lock);
 373         return 0;
 374 }
 375
 376 static int ip6_forward_proxy_check(struct sk_buff *skb)
 377 {
 378         struct ipv6hdr *hdr = ipv6_hdr(skb);
 379         u8 nexthdr = hdr->nexthdr;
 380         __be16 frag_off;
 381         int offset;
 382
 383         if (ipv6_ext_hdr(nexthdr)) {
 384                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 385                 if (offset < 0)
 386                         return 0;
 387         } else
 388                 offset = sizeof(struct ipv6hdr);
 389
 390         if (nexthdr == IPPROTO_ICMPV6) {
 391                 struct icmp6hdr *icmp6;
 392
 393                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 394                                          offset + 1 - skb->data)))
 395                         return 0;
 396
 397                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 398
 399                 switch (icmp6->icmp6_type) {
 400                 case NDISC_ROUTER_SOLICITATION:
 401                 case NDISC_ROUTER_ADVERTISEMENT:
 402                 case NDISC_NEIGHBOUR_SOLICITATION:
 403                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 404                 case NDISC_REDIRECT:
 405                         /* For reaction involving unicast neighbor discovery
 406                          * message destined to the proxied address, pass it to
 407                          * input function.
 408                          */
 409                         return 1;
 410                 default:
 411                         break;
 412                 }
 413         }
 414
 415         /*
 416          * The proxying router can't forward traffic sent to a link-local
 417          * address, so signal the sender and discard the packet. This
 418          * behavior is clarified by the MIPv6 specification.
 419          */
 420         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 421                 dst_link_failure(skb);
 422                 return -1;
 423         }
 424
 425         return 0;
 426 }
 427
 428 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 429                                      struct sk_buff *skb)
 430 {
 431         struct dst_entry *dst = skb_dst(skb);
 432
 433         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 434         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 435
 436 #ifdef CONFIG_NET_SWITCHDEV
 437         if (skb->offload_l3_fwd_mark) {
 438                 consume_skb(skb);
 439                 return 0;
 440         }
 441 #endif
 442
 443         skb->tstamp = 0;
 444         return dst_output(net, sk, skb);
 445 }
 446
 447 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 448 {
 449         if (skb->len <= mtu)
 450                 return false;
 451
 452         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 453         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 454                 return true;
 455
 456         if (skb->ignore_df)
 457                 return false;
 458
 459         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 460                 return false;
 461
 462         return true;
 463 }
 464
 465 int ip6_forward(struct sk_buff *skb)
 466 {
 467         struct dst_entry *dst = skb_dst(skb);
 468         struct ipv6hdr *hdr = ipv6_hdr(skb);
 469         struct inet6_skb_parm *opt = IP6CB(skb);
 470         struct net *net = dev_net(dst->dev);
 471         struct inet6_dev *idev;
 472         u32 mtu;
 473
 474         idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
 475         if (net->ipv6.devconf_all->forwarding == 0)
 476                 goto error;
 477
 478         if (skb->pkt_type != PACKET_HOST)
 479                 goto drop;
 480
 481         if (unlikely(skb->sk))
 482                 goto drop;
 483
 484         if (skb_warn_if_lro(skb))
 485                 goto drop;
 486
 487         if (!net->ipv6.devconf_all->disable_policy &&
 488             (!idev || !idev->cnf.disable_policy) &&
 489             !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 490                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 491                 goto drop;
 492         }
 493
 494         skb_forward_csum(skb);
 495
 496         /*
 497          *      We DO NOT make any processing on
 498          *      RA packets, pushing them to user level AS IS
 499          *      without ane WARRANTY that application will be able
 500          *      to interpret them. The reason is that we
 501          *      cannot make anything clever here.
 502          *
 503          *      We are not end-node, so that if packet contains
 504          *      AH/ESP, we cannot make anything.
 505          *      Defragmentation also would be mistake, RA packets
 506          *      cannot be fragmented, because there is no warranty
 507          *      that different fragments will go along one path. --ANK
 508          */
 509         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 510                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 511                         return 0;
 512         }
 513
 514         /*
 515          *      check and decrement ttl
 516          */
 517         if (hdr->hop_limit <= 1) {
 518                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 519                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 520
 521                 kfree_skb(skb);
 522                 return -ETIMEDOUT;
 523         }
 524
 525         /* XXX: idev->cnf.proxy_ndp? */
 526         if (net->ipv6.devconf_all->proxy_ndp &&
 527             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 528                 int proxied = ip6_forward_proxy_check(skb);
 529                 if (proxied > 0) {
 530                         hdr->hop_limit--;
 531                         return ip6_input(skb);
 532                 } else if (proxied < 0) {
 533                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 534                         goto drop;
 535                 }
 536         }
 537
 538         if (!xfrm6_route_forward(skb)) {
 539                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 540                 goto drop;
 541         }
 542         dst = skb_dst(skb);
 543
 544         /* IPv6 specs say nothing about it, but it is clear that we cannot
 545            send redirects to source routed frames.
 546            We don't send redirects to frames decapsulated from IPsec.
 547          */
 548         if (IP6CB(skb)->iif == dst->dev->ifindex &&
 549             opt->srcrt == 0 && !skb_sec_path(skb)) {
 550                 struct in6_addr *target = NULL;
 551                 struct inet_peer *peer;
 552                 struct rt6_info *rt;
 553
 554                 /*
 555                  *      incoming and outgoing devices are the same
 556                  *      send a redirect.
 557                  */
 558
 559                 rt = (struct rt6_info *) dst;
 560                 if (rt->rt6i_flags & RTF_GATEWAY)
 561                         target = &rt->rt6i_gateway;
 562                 else
 563                         target = &hdr->daddr;
 564
 565                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 566
 567                 /* Limit redirects both by destination (here)
 568                    and by source (inside ndisc_send_redirect)
 569                  */
 570                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 571                         ndisc_send_redirect(skb, target);
 572                 if (peer)
 573                         inet_putpeer(peer);
 574         } else {
 575                 int addrtype = ipv6_addr_type(&hdr->saddr);
 576
 577                 /* This check is security critical. */
 578                 if (addrtype == IPV6_ADDR_ANY ||
 579                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 580                         goto error;
 581                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 582                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 583                                     ICMPV6_NOT_NEIGHBOUR, 0);
 584                         goto error;
 585                 }
 586         }
 587
 588         mtu = ip6_dst_mtu_maybe_forward(dst, true);
 589         if (mtu < IPV6_MIN_MTU)
 590                 mtu = IPV6_MIN_MTU;
 591
 592         if (ip6_pkt_too_big(skb, mtu)) {
 593                 /* Again, force OUTPUT device used as source address */
 594                 skb->dev = dst->dev;
 595                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 596                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
 597                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 598                                 IPSTATS_MIB_FRAGFAILS);
 599                 kfree_skb(skb);
 600                 return -EMSGSIZE;
 601         }
 602
 603         if (skb_cow(skb, dst->dev->hard_header_len)) {
 604                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 605                                 IPSTATS_MIB_OUTDISCARDS);
 606                 goto drop;
 607         }
 608
 609         hdr = ipv6_hdr(skb);
 610
 611         /* Mangling hops number delayed to point after skb COW */
 612
 613         hdr->hop_limit--;
 614
 615         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 616                        net, NULL, skb, skb->dev, dst->dev,
 617                        ip6_forward_finish);
 618
 619 error:
 620         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 621 drop:
 622         kfree_skb(skb);
 623         return -EINVAL;
 624 }
 625
 626 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 627 {
 628         to->pkt_type = from->pkt_type;
 629         to->priority = from->priority;
 630         to->protocol = from->protocol;
 631         skb_dst_drop(to);
 632         skb_dst_set(to, dst_clone(skb_dst(from)));
 633         to->dev = from->dev;
 634         to->mark = from->mark;
 635
 636         skb_copy_hash(to, from);
 637
 638 #ifdef CONFIG_NET_SCHED
 639         to->tc_index = from->tc_index;
 640 #endif
 641         nf_copy(to, from);
 642         skb_ext_copy(to, from);
 643         skb_copy_secmark(to, from);
 644 }
 645
 646 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
 647                       u8 nexthdr, __be32 frag_id,
 648                       struct ip6_fraglist_iter *iter)
 649 {
 650         unsigned int first_len;
 651         struct frag_hdr *fh;
 652
 653         /* BUILD HEADER */
 654         *prevhdr = NEXTHDR_FRAGMENT;
 655         iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 656         if (!iter->tmp_hdr)
 657                 return -ENOMEM;
 658
 659         iter->frag = skb_shinfo(skb)->frag_list;
 660         skb_frag_list_init(skb);
 661
 662         iter->offset = 0;
 663         iter->hlen = hlen;
 664         iter->frag_id = frag_id;
 665         iter->nexthdr = nexthdr;
 666
 667         __skb_pull(skb, hlen);
 668         fh = __skb_push(skb, sizeof(struct frag_hdr));
 669         __skb_push(skb, hlen);
 670         skb_reset_network_header(skb);
 671         memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
 672
 673         fh->nexthdr = nexthdr;
 674         fh->reserved = 0;
 675         fh->frag_off = htons(IP6_MF);
 676         fh->identification = frag_id;
 677
 678         first_len = skb_pagelen(skb);
 679         skb->data_len = first_len - skb_headlen(skb);
 680         skb->len = first_len;
 681         ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 682
 683         return 0;
 684 }
 685 EXPORT_SYMBOL(ip6_fraglist_init);
 686
 687 void ip6_fraglist_prepare(struct sk_buff *skb,
 688                           struct ip6_fraglist_iter *iter)
 689 {
 690         struct sk_buff *frag = iter->frag;
 691         unsigned int hlen = iter->hlen;
 692         struct frag_hdr *fh;
 693
 694         frag->ip_summed = CHECKSUM_NONE;
 695         skb_reset_transport_header(frag);
 696         fh = __skb_push(frag, sizeof(struct frag_hdr));
 697         __skb_push(frag, hlen);
 698         skb_reset_network_header(frag);
 699         memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
 700         iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
 701         fh->nexthdr = iter->nexthdr;
 702         fh->reserved = 0;
 703         fh->frag_off = htons(iter->offset);
 704         if (frag->next)
 705                 fh->frag_off |= htons(IP6_MF);
 706         fh->identification = iter->frag_id;
 707         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 708         ip6_copy_metadata(frag, skb);
 709 }
 710 EXPORT_SYMBOL(ip6_fraglist_prepare);
 711
 712 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
 713                    unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
 714                    u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
 715 {
 716         state->prevhdr = prevhdr;
 717         state->nexthdr = nexthdr;
 718         state->frag_id = frag_id;
 719
 720         state->hlen = hlen;
 721         state->mtu = mtu;
 722
 723         state->left = skb->len - hlen;  /* Space per frame */
 724         state->ptr = hlen;              /* Where to start from */
 725
 726         state->hroom = hdr_room;
 727         state->troom = needed_tailroom;
 728
 729         state->offset = 0;
 730 }
 731 EXPORT_SYMBOL(ip6_frag_init);
 732
 733 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
 734 {
 735         u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
 736         struct sk_buff *frag;
 737         struct frag_hdr *fh;
 738         unsigned int len;
 739
 740         len = state->left;
 741         /* IF: it doesn't fit, use 'mtu' - the data space left */
 742         if (len > state->mtu)
 743                 len = state->mtu;
 744         /* IF: we are not sending up to and including the packet end
 745            then align the next start on an eight byte boundary */
 746         if (len < state->left)
 747                 len &= ~7;
 748
 749         /* Allocate buffer */
 750         frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
 751                          state->hroom + state->troom, GFP_ATOMIC);
 752         if (!frag)
 753                 return ERR_PTR(-ENOMEM);
 754
 755         /*
 756          *      Set up data on packet
 757          */
 758
 759         ip6_copy_metadata(frag, skb);
 760         skb_reserve(frag, state->hroom);
 761         skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
 762         skb_reset_network_header(frag);
 763         fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
 764         frag->transport_header = (frag->network_header + state->hlen +
 765                                   sizeof(struct frag_hdr));
 766
 767         /*
 768          *      Charge the memory for the fragment to any owner
 769          *      it might possess
 770          */
 771         if (skb->sk)
 772                 skb_set_owner_w(frag, skb->sk);
 773
 774         /*
 775          *      Copy the packet header into the new buffer.
 776          */
 777         skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
 778
 779         fragnexthdr_offset = skb_network_header(frag);
 780         fragnexthdr_offset += prevhdr - skb_network_header(skb);
 781         *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 782
 783         /*
 784          *      Build fragment header.
 785          */
 786         fh->nexthdr = state->nexthdr;
 787         fh->reserved = 0;
 788         fh->identification = state->frag_id;
 789
 790         /*
 791          *      Copy a block of the IP datagram.
 792          */
 793         BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
 794                              len));
 795         state->left -= len;
 796
 797         fh->frag_off = htons(state->offset);
 798         if (state->left > 0)
 799                 fh->frag_off |= htons(IP6_MF);
 800         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 801
 802         state->ptr += len;
 803         state->offset += len;
 804
 805         return frag;
 806 }
 807 EXPORT_SYMBOL(ip6_frag_next);
 808
     /*
      * ip6_fragment - split an IPv6 packet into fragments and transmit them
      * @net: network namespace, passed to @output and used for the SNMP counters
      * @sk: socket handed through to @output
      * @skb: packet to fragment
      * @output: callback invoked once for every fragment
      *
      * Two strategies are used.  If @skb already carries a frag_list whose
      * members pass the geometry checks below, those buffers are turned into
      * the fragments in place with the ip6_fraglist_*() helpers.  Otherwise
      * the slow path builds every fragment with ip6_frag_next().
      *
      * Every error path that reaches the "fail" label frees @skb here.  When
      * fragmentation is refused (fail_toobig) an ICMPV6_PKT_TOOBIG message is
      * sent and -EMSGSIZE is returned.  The fast path returns 0 once every
      * fragment was accepted by @output, otherwise the error @output reported.
      * The slow path returns err, which holds the result of the last @output
      * call when at least one fragment was sent.
      */
 809 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 810                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 811 {
 812         struct sk_buff *frag;
 813         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
             /* np stays NULL without an owning socket or while
              * dev_recursion_level() is non-zero.
              */
 814         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 815                                 inet6_sk(skb->sk) : NULL;
 816         struct ip6_frag_state state;
 817         unsigned int mtu, hlen, nexthdr_offset;
 818         ktime_t tstamp = skb->tstamp;
 819         int hroom, err = 0;
 820         __be32 frag_id;
 821         u8 *prevhdr, nexthdr = 0;
 822
 823         err = ip6_find_1stfragopt(skb, &prevhdr);
 824         if (err < 0)
 825                 goto fail;
             /* The non-negative result is used below as the length of the
              * header part that precedes the fragment header in every fragment.
              */
 826         hlen = err;
 827         nexthdr = *prevhdr;
             /* Keep prevhdr as an offset so it can be recomputed later. */
 828         nexthdr_offset = prevhdr - skb_network_header(skb);
 829
 830         mtu = ip6_skb_dst_mtu(skb);
 831
 832         /* We must not fragment if the socket is set to force MTU discovery
 833          * or if the skb is not generated by a local socket.
 834          */
 835         if (unlikely(!skb->ignore_df && skb->len > mtu))
 836                 goto fail_toobig;
 837
 838         if (IP6CB(skb)->frag_max_size) {
 839                 if (IP6CB(skb)->frag_max_size > mtu)
 840                         goto fail_toobig;
 841
 842                 /* don't send fragments larger than what we received */
 843                 mtu = IP6CB(skb)->frag_max_size;
 844                 if (mtu < IPV6_MIN_MTU)
 845                         mtu = IPV6_MIN_MTU;
 846         }
 847
             /* A non-zero per-socket frag_size below the MTU lowers the limit. */
 848         if (np && np->frag_size < mtu) {
 849                 if (np->frag_size)
 850                         mtu = np->frag_size;
 851         }
             /* Each fragment must hold the headers, a fragment header and at
              * least 8 bytes of payload.
              */
 852         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 853                 goto fail_toobig;
             /* From here on mtu is the payload budget of a single fragment. */
 854         mtu -= hlen + sizeof(struct frag_hdr);
 855
 856         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 857                                     &ipv6_hdr(skb)->saddr);
 858
 859         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 860             (err = skb_checksum_help(skb)))
 861                 goto fail;
 862
             /* Rebuild prevhdr from the saved offset.
              * NOTE(review): presumably needed because skb_checksum_help() may
              * have moved the header - confirm.
              */
 863         prevhdr = skb_network_header(skb) + nexthdr_offset;
 864         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 865         if (skb_has_frag_list(skb)) {
 866                 unsigned int first_len = skb_pagelen(skb);
 867                 struct ip6_fraglist_iter iter;
 868                 struct sk_buff *frag2;
 869
                     /* The head must fit the budget, carry a multiple of
                      * 8 payload bytes, not be cloned and have headroom for
                      * the link-layer reserve plus a fragment header.
                      */
 870                 if (first_len - hlen > mtu ||
 871                     ((first_len - hlen) & 7) ||
 872                     skb_cloned(skb) ||
 873                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 874                         goto slow_path;
 875
 876                 skb_walk_frags(skb, frag) {
 877                         /* Correct geometry. */
 878                         if (frag->len > mtu ||
 879                             ((frag->len & 7) && frag->next) ||
 880                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 881                                 goto slow_path_clean;
 882
 883                         /* Partially cloned skb? */
 884                         if (skb_shared(frag))
 885                                 goto slow_path_clean;
 886
 887                         BUG_ON(frag->sk);
                             /* Attach the socket to the fragment and move its
                              * truesize out of the head skb; slow_path_clean
                              * reverts exactly these changes.
                              */
 888                         if (skb->sk) {
 889                                 frag->sk = skb->sk;
 890                                 frag->destructor = sock_wfree;
 891                         }
 892                         skb->truesize -= frag->truesize;
 893                 }
 894
 895                 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
 896                                         &iter);
 897                 if (err < 0)
 898                         goto fail;
 899
 900                 /* We prevent @rt from being freed. */
 901                 rcu_read_lock();
 902
 903                 for (;;) {
 904                         /* Prepare header of the next frame,
 905                          * before previous one went down. */
 906                         if (iter.frag)
 907                                 ip6_fraglist_prepare(skb, &iter);
 908
 909                         skb->tstamp = tstamp;
 910                         err = output(net, sk, skb);
 911                         if (!err)
 912                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 913                                               IPSTATS_MIB_FRAGCREATES);
 914
 915                         if (err || !iter.frag)
 916                                 break;
 917
 918                         skb = ip6_fraglist_next(&iter);
 919                 }
 920
 921                 kfree(iter.tmp_hdr);
 922
 923                 if (err == 0) {
 924                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 925                                       IPSTATS_MIB_FRAGOKS);
 926                         rcu_read_unlock();
 927                         return 0;
 928                 }
 929
                     /* Drop the fragments that were not handed to output(). */
 930                 kfree_skb_list(iter.frag);
 931
 932                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 933                               IPSTATS_MIB_FRAGFAILS);
 934                 rcu_read_unlock();
 935                 return err;
 936
 937 slow_path_clean:
                     /* Undo the sk/destructor/truesize changes for the frags
                      * visited before the one that failed the checks (frag).
                      */
 938                 skb_walk_frags(skb, frag2) {
 939                         if (frag2 == frag)
 940                                 break;
 941                         frag2->sk = NULL;
 942                         frag2->destructor = NULL;
 943                         skb->truesize += frag2->truesize;
 944                 }
 945         }
 946
 947 slow_path:
 948         /*
 949          *      Fragment the datagram.
 950          */
 951
 952         ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
 953                       LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
 954                       &state);
 955
 956         /*
 957          *      Keep copying data until we run out.
 958          */
 959
 960         while (state.left > 0) {
 961                 frag = ip6_frag_next(skb, &state);
 962                 if (IS_ERR(frag)) {
 963                         err = PTR_ERR(frag);
 964                         goto fail;
 965                 }
 966
 967                 /*
 968                  *      Put this fragment into the sending queue.
 969                  */
 970                 frag->tstamp = tstamp;
 971                 err = output(net, sk, frag);
 972                 if (err)
 973                         goto fail;
 974
 975                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 976                               IPSTATS_MIB_FRAGCREATES);
 977         }
 978         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 979                       IPSTATS_MIB_FRAGOKS);
             /* The original packet is no longer needed once all fragments
              * have been passed to output().
              */
 980         consume_skb(skb);
 981         return err;
 982
 983 fail_toobig:
 984         if (skb->sk && dst_allfrag(skb_dst(skb)))
 985                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 986
 987         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 988         err = -EMSGSIZE;
 989
 990 fail:
 991         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 992                       IPSTATS_MIB_FRAGFAILS);
 993         kfree_skb(skb);
 994         return err;
 995 }
 996
     /*
      * ip6_rt_check - test whether a cached route key fails to match an address
      * @rt_key: destination or source key of the cached route
      * @fl_addr: address from the flow being routed
      * @addr_cache: last address remembered for the socket, may be NULL
      *
      * Returns non-zero when neither validation succeeds, i.e. the key is not
      * a /128 entry equal to @fl_addr and @addr_cache is NULL or differs from
      * @fl_addr.  Returns 0 when either of the two checks matches.
      */
 997 static inline int ip6_rt_check(const struct rt6key *rt_key,
 998                                const struct in6_addr *fl_addr,
 999                                const struct in6_addr *addr_cache)
1000 {
1001         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1002                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1003 }
1004
     /*
      * ip6_sk_dst_check - validate a socket's cached dst against a flow
      * @sk: socket whose ipv6_pinfo supplies daddr_cache (and saddr_cache)
      * @dst: cached dst entry, may be NULL
      * @fl6: flow that is about to be routed
      *
      * Returns @dst when it can be used for @fl6.  Otherwise the reference on
      * @dst is dropped with dst_release() and NULL is returned.  A NULL @dst
      * is returned unchanged.
      */
1005 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1006                                           struct dst_entry *dst,
1007                                           const struct flowi6 *fl6)
1008 {
1009         struct ipv6_pinfo *np = inet6_sk(sk);
1010         struct rt6_info *rt;
1011
1012         if (!dst)
1013                 goto out;
1014
             /* Only IPv6 dst entries may be cast to struct rt6_info below. */
1015         if (dst->ops->family != AF_INET6) {
1016                 dst_release(dst);
1017                 return NULL;
1018         }
1019
1020         rt = (struct rt6_info *)dst;
1021         /* Yes, checking route validity in not connected
1022          * case is not very simple. Take into account,
1023          * that we do not support routing by source, TOS,
1024          * and MSG_DONTROUTE            --ANK (980726)
1025          *
1026          * 1. ip6_rt_check(): If route was host route,
1027          *    check that cached destination is current.
1028          *    If it is network route, we still may
1029          *    check its validity using saved pointer
1030          *    to the last used address: daddr_cache.
1031          *    We do not want to save whole address now,
1032          *    (because main consumer of this service
1033          *    is tcp, which has not this problem),
1034          *    so that the last trick works only on connected
1035          *    sockets.
1036          * 2. oif also should be the same.
1037          */
             /* The oif comparison is skipped when the flow carries
              * FLOWI_FLAG_SKIP_NH_OIF or has no oif set.
              */
1038         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1039 #ifdef CONFIG_IPV6_SUBTREES
1040             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1041 #endif
1042            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1043               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1044                 dst_release(dst);
1045                 dst = NULL;
1046         }
1047
1048 out:
1049         return dst;
1050 }
1051
     /*
      * ip6_dst_lookup_tail - common route lookup behind the ip6_dst_lookup*() API
      * @net: network namespace to perform the lookup in
      * @sk: socket providing source-address preferences, may be NULL
      * @dst: in/out route pointer; both callers visible in this file pass a
      *       pointer to NULL
      * @fl6: flow to look up; fl6->saddr is filled in when it was unspecified
      *
      * Returns 0 with *dst holding the route.  On failure the route is
      * released, *dst is set to NULL and a negative errno is returned;
      * -ENETUNREACH additionally bumps IPSTATS_MIB_OUTNOROUTES.
      */
1052 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1053                                struct dst_entry **dst, struct flowi6 *fl6)
1054 {
1055 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1056         struct neighbour *n;
1057         struct rt6_info *rt;
1058 #endif
1059         int err;
1060         int flags = 0;
1061
1062         /* The correct way to handle this would be to do
1063          * ip6_route_get_saddr, and then ip6_route_output; however,
1064          * the route-specific preferred source forces the
1065          * ip6_route_output call _before_ ip6_route_get_saddr.
1066          *
1067          * In source specific routing (no src=any default route),
1068          * ip6_route_output will fail given src=any saddr, though, so
1069          * that's why we try it again later.
1070          */
1071         if (ipv6_addr_any(&fl6->saddr)) {
1072                 struct fib6_info *from;
                     /* Shadows the function-scope rt that exists when
                      * CONFIG_IPV6_OPTIMISTIC_DAD is enabled.
                      */
1073                 struct rt6_info *rt;
1074
1075                 *dst = ip6_route_output(net, sk, fl6);
1076                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1077
                     /* rt->from is read under RCU; a failed first lookup
                      * selects the source address without route information.
                      */
1078                 rcu_read_lock();
1079                 from = rt ? rcu_dereference(rt->from) : NULL;
1080                 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1081                                           sk ? inet6_sk(sk)->srcprefs : 0,
1082                                           &fl6->saddr);
1083                 rcu_read_unlock();
1084
1085                 if (err)
1086                         goto out_err_release;
1087
1088                 /* If we had an erroneous initial result, pretend it
1089                  * never existed and let the SA-enabled version take
1090                  * over.
1091                  */
1092                 if ((*dst)->error) {
1093                         dst_release(*dst);
1094                         *dst = NULL;
1095                 }
1096
1097                 if (fl6->flowi6_oif)
1098                         flags |= RT6_LOOKUP_F_IFACE;
1099         }
1100
             /* Look up (again) now that the flow has a source address. */
1101         if (!*dst)
1102                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1103
1104         err = (*dst)->error;
1105         if (err)
1106                 goto out_err_release;
1107
1108 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1109         /*
1110          * Here if the dst entry we've looked up
1111          * has a neighbour entry that is in the INCOMPLETE
1112          * state and the src address from the flow is
1113          * marked as OPTIMISTIC, we release the found
1114          * dst entry and replace it instead with the
1115          * dst entry of the nexthop router
1116          */
1117         rt = (struct rt6_info *) *dst;
1118         rcu_read_lock_bh();
1119         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1120                                       rt6_nexthop(rt, &fl6->daddr));
             /* err is only a local flag here: non-zero when a neighbour
              * entry exists and is not in a NUD_VALID state.  It is not
              * returned unless the replacement lookup below fails.
              */
1121         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1122         rcu_read_unlock_bh();
1123
1124         if (err) {
1125                 struct inet6_ifaddr *ifp;
1126                 struct flowi6 fl_gw6;
1127                 int redirect;
1128
1129                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1130                                       (*dst)->dev, 1);
1131
1132                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1133                 if (ifp)
1134                         in6_ifa_put(ifp);
1135
1136                 if (redirect) {
1137                         /*
1138                          * We need to get the dst entry for the
1139                          * default router instead
1140                          */
1141                         dst_release(*dst);
                             /* Same flow, but with the destination cleared. */
1142                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1143                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1144                         *dst = ip6_route_output(net, sk, &fl_gw6);
1145                         err = (*dst)->error;
1146                         if (err)
1147                                 goto out_err_release;
1148                 }
1149         }
1150 #endif
             /* A v4-mapped source is only accepted together with a v4-mapped
              * or unspecified destination.
              */
1151         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1152             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1153                 err = -EAFNOSUPPORT;
1154                 goto out_err_release;
1155         }
1156
1157         return 0;
1158
1159 out_err_release:
1160         dst_release(*dst);
1161         *dst = NULL;
1162
1163         if (err == -ENETUNREACH)
1164                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1165         return err;
1166 }
1167
1168 /**
1169  *      ip6_dst_lookup - perform route lookup on flow
1170  *      @net: Network namespace to perform lookup in
1171  *      @sk: socket which provides route info
1172  *      @dst: pointer to dst_entry * for result
1173  *      @fl6: flow to lookup
1174  *
1175  *      This function performs a route lookup on the given flow.
      *      Any previous value of *@dst is overwritten with NULL before the
      *      lookup, and *@dst is NULL again when the lookup fails.
1176  *
1177  *      It returns zero on success, or a standard errno code on error.
1178  */
1179 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1180                    struct flowi6 *fl6)
1181 {
1182         *dst = NULL;
1183         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1184 }
1185 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1186
1187 /**
1188  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1189  *      @net: Network namespace to perform lookup in
1190  *      @sk: socket which provides route info
1191  *      @fl6: flow to lookup
1192  *      @final_dst: final destination address for ipsec lookup
1193  *
1194  *      This function performs a route lookup on the given flow.
      *      The route itself is resolved with the destination already in
      *      @fl6; when @final_dst is given it is written to @fl6->daddr
      *      before the xfrm lookup.
1195  *
1196  *      It returns a valid dst pointer on success, or a pointer encoded
1197  *      error code.
1198  */
1199 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1200                                       const struct in6_addr *final_dst)
1201 {
1202         struct dst_entry *dst = NULL;
1203         int err;
1204
1205         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1206         if (err)
1207                 return ERR_PTR(err);
             /* Note that this modifies the caller's flow. */
1208         if (final_dst)
1209                 fl6->daddr = *final_dst;
1210
1211         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1212 }
1213 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1214
1215 /**
1216  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1217  *      @sk: socket which provides the dst cache and route info
1218  *      @fl6: flow to lookup
1219  *      @final_dst: final destination address for ipsec lookup
1220  *      @connected: whether @sk is connected or not
1221  *
1222  *      This function performs a route lookup on the given flow with the
1223  *      possibility of using the cached route in the socket if it is valid.
1224  *      It will take the socket dst lock when operating on the dst cache.
1225  *      As a result, this function can only be used in process context.
1226  *
1227  *      In addition, for a connected socket, cache the dst in the socket
1228  *      if the current cache is not valid.
1229  *
1230  *      It returns a valid dst pointer on success, or a pointer encoded
1231  *      error code.
1232  */
1233 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1234                                          const struct in6_addr *final_dst,
1235                                          bool connected)
1236 {
1237         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1238
             /* The cached entry (possibly NULL) must also match this flow. */
1239         dst = ip6_sk_dst_check(sk, dst, fl6);
1240         if (dst)
1241                 return dst;
1242
             /* No usable cached route: do a full lookup. */
1243         dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
             /* The socket cache is given the dst_clone() result, the caller
              * gets dst itself.
              */
1244         if (connected && !IS_ERR(dst))
1245                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1246
1247         return dst;
1248 }
1249 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1250
1251 /**
1252  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1253  *      @skb: Packet for which lookup is done
1254  *      @dev: Tunnel device
1255  *      @net: Network namespace of tunnel device
1256  *      @sock: Socket which provides route info
1257  *      @saddr: Memory to store the src ip address
1258  *      @info: Tunnel information
1259  *      @protocol: IP protocol
1260  *      @use_cache: Flag to enable cache usage
      *
1261  *      This function performs a route lookup on a tunnel.
1262  *
1263  *      It returns a valid dst pointer and stores src address to be used in
1264  *      tunnel in param saddr on success, else a pointer encoded error code.
      *      Any failure of the route lookup is reported as -ENETUNREACH; a
      *      route whose output device is @dev itself is rejected with -ELOOP.
1265  */
1266
1267 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1268                                         struct net_device *dev,
1269                                         struct net *net,
1270                                         struct socket *sock,
1271                                         struct in6_addr *saddr,
1272                                         const struct ip_tunnel_info *info,
1273                                         u8 protocol,
1274                                         bool use_cache)
1275 {
1276         struct dst_entry *dst = NULL;
1277 #ifdef CONFIG_DST_CACHE
1278         struct dst_cache *dst_cache;
1279 #endif
1280         struct flowi6 fl6;
1281         __u8 prio;
1282
1283 #ifdef CONFIG_DST_CACHE
             /* The cast drops the const qualifier that comes with @info. */
1284         dst_cache = (struct dst_cache *)&info->dst_cache;
1285         if (use_cache) {
1286                 dst = dst_cache_get_ip6(dst_cache, saddr);
1287                 if (dst)
1288                         return dst;
1289         }
1290 #endif
             /* Build the flow from the tunnel key and the packet mark. */
1291         memset(&fl6, 0, sizeof(fl6));
1292         fl6.flowi6_mark = skb->mark;
1293         fl6.flowi6_proto = protocol;
1294         fl6.daddr = info->key.u.ipv6.dst;
1295         fl6.saddr = info->key.u.ipv6.src;
1296         prio = info->key.tos;
1297         fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1298
1299         dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1300                                               NULL);
1301         if (IS_ERR(dst)) {
1302                 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1303                 return ERR_PTR(-ENETUNREACH);
1304         }
1305         if (dst->dev == dev) { /* is this necessary? */
1306                 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1307                 dst_release(dst);
1308                 return ERR_PTR(-ELOOP);
1309         }
1310 #ifdef CONFIG_DST_CACHE
1311         if (use_cache)
1312                 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1313 #endif
             /* Report the source address the flow ended up with. */
1314         *saddr = fl6.saddr;
1315         return dst;
1316 }
1317 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1318
     /*
      * Duplicate an options extension header with kmemdup().  The copy is
      * (hdrlen + 1) * 8 bytes long; per RFC 8200 hdrlen counts 8-octet units
      * beyond the first 8 octets.  Returns NULL when @src is NULL or when
      * kmemdup() returns NULL.
      */
1319 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1320                                                gfp_t gfp)
1321 {
1322         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1323 }
1324
     /*
      * Duplicate a routing extension header with kmemdup().  The copy is
      * (hdrlen + 1) * 8 bytes long.  Returns NULL when @src is NULL or when
      * kmemdup() returns NULL.
      */
1325 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1326                                                 gfp_t gfp)
1327 {
1328         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1329 }
1330
     /*
      * ip6_append_data_mtu - recompute mtu and maxfraglen for the next skb
      * @mtu: updated in place
      * @maxfraglen: updated in place
      * @fragheaderlen: header length used in the maxfraglen computation
      * @skb: current tail skb, NULL when the first skb is being allocated
      * @rt: route of the corked flow
      * @orig_mtu: MTU the caller started from
      *
      * Nothing is changed for routes flagged DST_XFRM_TUNNEL.
      */
1331 static void ip6_append_data_mtu(unsigned int *mtu,
1332                                 int *maxfraglen,
1333                                 unsigned int fragheaderlen,
1334                                 struct sk_buff *skb,
1335                                 struct rt6_info *rt,
1336                                 unsigned int orig_mtu)
1337 {
1338         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1339                 if (!skb) {
1340                         /* first fragment, reserve header_len */
1341                         *mtu = orig_mtu - rt->dst.header_len;
1342
1343                 } else {
1344                         /*
1345                          * this fragment is not first, the headers
1346                          * space is regarded as data space.
1347                          */
1348                         *mtu = orig_mtu;
1349                 }
                     /* Round the space after the headers down to a multiple
                      * of 8 and leave room for the fragment header.
                      */
1350                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1351                               + fragheaderlen - sizeof(struct frag_hdr);
1352         }
1353 }
1354
     /*
      * ip6_setup_cork - initialise the cork state for a new corked send
      * @sk: sending socket
      * @cork: generic cork state to fill in
      * @v6_cork: IPv6 specific cork state to fill in
      * @ipc6: per-call cookie supplying options, hop limit, tclass, etc.
      * @rt: route to use; a reference is taken with dst_hold()
      * @fl6: flow, copied into @cork
      *
      * Returns 0 on success, -EINVAL if @v6_cork already holds options, or
      * -ENOBUFS when copying the options fails.
      */
1355 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1356                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1357                           struct rt6_info *rt, struct flowi6 *fl6)
1358 {
1359         struct ipv6_pinfo *np = inet6_sk(sk);
1360         unsigned int mtu;
1361         struct ipv6_txoptions *opt = ipc6->opt;
1362
1363         /*
1364          * setup for corking
1365          */
1366         if (opt) {
1367                 if (WARN_ON(v6_cork->opt))
1368                         return -EINVAL;
1369
1370                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1371                 if (unlikely(!v6_cork->opt))
1372                         return -ENOBUFS;
1373
1374                 v6_cork->opt->tot_len = sizeof(*opt);
1375                 v6_cork->opt->opt_flen = opt->opt_flen;
1376                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1377
                     /* Each header is duplicated separately.  On failure the
                      * partially filled v6_cork->opt is left in place.
                      * NOTE(review): presumably freed by the caller's cork
                      * release path - confirm.
                      */
1378                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1379                                                     sk->sk_allocation);
1380                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1381                         return -ENOBUFS;
1382
1383                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1384                                                     sk->sk_allocation);
1385                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1386                         return -ENOBUFS;
1387
1388                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1389                                                    sk->sk_allocation);
1390                 if (opt->hopopt && !v6_cork->opt->hopopt)
1391                         return -ENOBUFS;
1392
1393                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1394                                                     sk->sk_allocation);
1395                 if (opt->srcrt && !v6_cork->opt->srcrt)
1396                         return -ENOBUFS;
1397
1398                 /* need source address above miyazawa*/
1399         }
1400         dst_hold(&rt->dst);
1401         cork->base.dst = &rt->dst;
1402         cork->fl.u.ip6 = *fl6;
1403         v6_cork->hop_limit = ipc6->hlimit;
1404         v6_cork->tclass = ipc6->tclass;
             /* With pmtudisc >= IPV6_PMTUDISC_PROBE the device MTU is used.
              * Otherwise the MTU comes from the route itself for XFRM tunnel
              * routes, or from xfrm_dst_path() of the route.
              */
1405         if (rt->dst.flags & DST_XFRM_TUNNEL)
1406                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1407                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1408         else
1409                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1410                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
             /* A non-zero per-socket frag_size below the MTU lowers the limit. */
1411         if (np->frag_size < mtu) {
1412                 if (np->frag_size)
1413                         mtu = np->frag_size;
1414         }
1415         cork->base.fragsize = mtu;
1416         cork->base.gso_size = ipc6->gso_size;
1417         cork->base.tx_flags = 0;
1418         cork->base.mark = ipc6->sockc.mark;
1419         sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1420
1421         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1422                 cork->base.flags |= IPCORK_ALLFRAG;
1423         cork->base.length = 0;
1424
1425         cork->base.transmit_time = ipc6->sockc.transmit_time;
1426
1427         return 0;
1428 }
1429
1430 static int __ip6_append_data(struct sock *sk,
1431                              struct flowi6 *fl6,
1432                              struct sk_buff_head *queue,
1433                              struct inet_cork *cork,
1434                              struct inet6_cork *v6_cork,
1435                              struct page_frag *pfrag,
1436                              int getfrag(void *from, char *to, int offset,
1437                                          int len, int odd, struct sk_buff *skb),
1438                              void *from, int length, int transhdrlen,
1439                              unsigned int flags, struct ipcm6_cookie *ipc6)
1440 {
1441         struct sk_buff *skb, *skb_prev = NULL;
1442         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1443         struct ubuf_info *uarg = NULL;
1444         int exthdrlen = 0;
1445         int dst_exthdrlen = 0;
1446         int hh_len;
1447         int copy;
1448         int err;
1449         int offset = 0;
1450         u32 tskey = 0;
1451         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1452         struct ipv6_txoptions *opt = v6_cork->opt;
1453         int csummode = CHECKSUM_NONE;
1454         unsigned int maxnonfragsize, headersize;
1455         unsigned int wmem_alloc_delta = 0;
1456         bool paged, extra_uref = false;
1457
1458         skb = skb_peek_tail(queue);
1459         if (!skb) {
1460                 exthdrlen = opt ? opt->opt_flen : 0;
1461                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1462         }
1463
1464         paged = !!cork->gso_size;
1465         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1466         orig_mtu = mtu;
1467
1468         if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1469             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1470                 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1471
1472         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1473
1474         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1475                         (opt ? opt->opt_nflen : 0);
1476
1477         headersize = sizeof(struct ipv6hdr) +
1478                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1479                      (dst_allfrag(&rt->dst) ?
1480                       sizeof(struct frag_hdr) : 0) +
1481                      rt->rt6i_nfheader_len;
1482
1483         if (mtu <= fragheaderlen ||
1484             ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1485                 goto emsgsize;
1486
1487         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1488                      sizeof(struct frag_hdr);
1489
1490         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1491          * the first fragment
1492          */
1493         if (headersize + transhdrlen > mtu)
1494                 goto emsgsize;
1495
1496         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1497             (sk->sk_protocol == IPPROTO_UDP ||
1498              sk->sk_protocol == IPPROTO_RAW)) {
1499                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1500                                 sizeof(struct ipv6hdr));
1501                 goto emsgsize;
1502         }
1503
1504         if (ip6_sk_ignore_df(sk))
1505                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1506         else
1507                 maxnonfragsize = mtu;
1508
1509         if (cork->length + length > maxnonfragsize - headersize) {
1510 emsgsize:
1511                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1512                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1513                 return -EMSGSIZE;
1514         }
1515
1516         /* CHECKSUM_PARTIAL only with no extension headers and when
1517          * we are not going to fragment
1518          */
1519         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1520             headersize == sizeof(struct ipv6hdr) &&
1521             length <= mtu - headersize &&
1522             (!(flags & MSG_MORE) || cork->gso_size) &&
1523             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1524                 csummode = CHECKSUM_PARTIAL;
1525
1526         if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1527                 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1528                 if (!uarg)
1529                         return -ENOBUFS;
1530                 extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1531                 if (rt->dst.dev->features & NETIF_F_SG &&
1532                     csummode == CHECKSUM_PARTIAL) {
1533                         paged = true;
1534                 } else {
1535                         uarg->zerocopy = 0;
1536                         skb_zcopy_set(skb, uarg, &extra_uref);
1537                 }
1538         }
1539
1540         /*
1541          * Let's try using as much space as possible.
1542          * Use MTU if total length of the message fits into the MTU.
1543          * Otherwise, we need to reserve fragment header and
1544          * fragment alignment (= 8-15 octects, in total).
1545          *
1546          * Note that we may need to "move" the data from the tail
1547          * of the buffer to the new fragment when we split
1548          * the message.
1549          *
1550          * FIXME: It may be fragmented into multiple chunks
1551          *        at once if non-fragmentable extension headers
1552          *        are too large.
1553          * --yoshfuji
1554          */
1555
1556         cork->length += length;
1557         if (!skb)
1558                 goto alloc_new_skb;
1559
1560         while (length > 0) {
1561                 /* Check if the remaining data fits into current packet. */
1562                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1563                 if (copy < length)
1564                         copy = maxfraglen - skb->len;
1565
1566                 if (copy <= 0) {
1567                         char *data;
1568                         unsigned int datalen;
1569                         unsigned int fraglen;
1570                         unsigned int fraggap;
1571                         unsigned int alloclen, alloc_extra;
1572                         unsigned int pagedlen;
1573 alloc_new_skb:
1574                         /* There's no room in the current skb */
1575                         if (skb)
1576                                 fraggap = skb->len - maxfraglen;
1577                         else
1578                                 fraggap = 0;
1579                         /* update mtu and maxfraglen if necessary */
1580                         if (!skb || !skb_prev)
1581                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1582                                                     fragheaderlen, skb, rt,
1583                                                     orig_mtu);
1584
1585                         skb_prev = skb;
1586
1587                         /*
1588                          * If remaining data exceeds the mtu,
1589                          * we know we need more fragment(s).
1590                          */
1591                         datalen = length + fraggap;
1592
1593                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1594                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1595                         fraglen = datalen + fragheaderlen;
1596                         pagedlen = 0;
1597
1598                         alloc_extra = hh_len;
1599                         alloc_extra += dst_exthdrlen;
1600                         alloc_extra += rt->dst.trailer_len;
1601
1602                         /* We just reserve space for fragment header.
1603                          * Note: this may be overallocation if the message
1604                          * (without MSG_MORE) fits into the MTU.
1605                          */
1606                         alloc_extra += sizeof(struct frag_hdr);
1607
1608                         if ((flags & MSG_MORE) &&
1609                             !(rt->dst.dev->features&NETIF_F_SG))
1610                                 alloclen = mtu;
1611                         else if (!paged &&
1612                                  (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1613                                   !(rt->dst.dev->features & NETIF_F_SG)))
1614                                 alloclen = fraglen;
1615                         else {
1616                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1617                                 pagedlen = fraglen - alloclen;
1618                         }
1619                         alloclen += alloc_extra;
1620
1621                         if (datalen != length + fraggap) {
1622                                 /*
1623                                  * this is not the last fragment, the trailer
1624                                  * space is regarded as data space.
1625                                  */
1626                                 datalen += rt->dst.trailer_len;
1627                         }
1628
1629                         fraglen = datalen + fragheaderlen;
1630
1631                         copy = datalen - transhdrlen - fraggap - pagedlen;
1632                         if (copy < 0) {
1633                                 err = -EINVAL;
1634                                 goto error;
1635                         }
1636                         if (transhdrlen) {
1637                                 skb = sock_alloc_send_skb(sk, alloclen,
1638                                                 (flags & MSG_DONTWAIT), &err);
1639                         } else {
1640                                 skb = NULL;
1641                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1642                                     2 * sk->sk_sndbuf)
1643                                         skb = alloc_skb(alloclen,
1644                                                         sk->sk_allocation);
1645                                 if (unlikely(!skb))
1646                                         err = -ENOBUFS;
1647                         }
1648                         if (!skb)
1649                                 goto error;
1650                         /*
1651                          *      Fill in the control structures
1652                          */
1653                         skb->protocol = htons(ETH_P_IPV6);
1654                         skb->ip_summed = csummode;
1655                         skb->csum = 0;
1656                         /* reserve for fragmentation and ipsec header */
1657                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1658                                     dst_exthdrlen);
1659
1660                         /*
1661                          *      Find where to start putting bytes
1662                          */
1663                         data = skb_put(skb, fraglen - pagedlen);
1664                         skb_set_network_header(skb, exthdrlen);
1665                         data += fragheaderlen;
1666                         skb->transport_header = (skb->network_header +
1667                                                  fragheaderlen);
1668                         if (fraggap) {
1669                                 skb->csum = skb_copy_and_csum_bits(
1670                                         skb_prev, maxfraglen,
1671                                         data + transhdrlen, fraggap);
1672                                 skb_prev->csum = csum_sub(skb_prev->csum,
1673                                                           skb->csum);
1674                                 data += fraggap;
1675                                 pskb_trim_unique(skb_prev, maxfraglen);
1676                         }
1677                         if (copy > 0 &&
1678                             getfrag(from, data + transhdrlen, offset,
1679                                     copy, fraggap, skb) < 0) {
1680                                 err = -EFAULT;
1681                                 kfree_skb(skb);
1682                                 goto error;
1683                         }
1684
1685                         offset += copy;
1686                         length -= copy + transhdrlen;
1687                         transhdrlen = 0;
1688                         exthdrlen = 0;
1689                         dst_exthdrlen = 0;
1690
1691                         /* Only the initial fragment is time stamped */
1692                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1693                         cork->tx_flags = 0;
1694                         skb_shinfo(skb)->tskey = tskey;
1695                         tskey = 0;
1696                         skb_zcopy_set(skb, uarg, &extra_uref);
1697
1698                         if ((flags & MSG_CONFIRM) && !skb_prev)
1699                                 skb_set_dst_pending_confirm(skb, 1);
1700
1701                         /*
1702                          * Put the packet on the pending queue
1703                          */
1704                         if (!skb->destructor) {
1705                                 skb->destructor = sock_wfree;
1706                                 skb->sk = sk;
1707                                 wmem_alloc_delta += skb->truesize;
1708                         }
1709                         __skb_queue_tail(queue, skb);
1710                         continue;
1711                 }
1712
1713                 if (copy > length)
1714                         copy = length;
1715
1716                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1717                     skb_tailroom(skb) >= copy) {
1718                         unsigned int off;
1719
1720                         off = skb->len;
1721                         if (getfrag(from, skb_put(skb, copy),
1722                                                 offset, copy, off, skb) < 0) {
1723                                 __skb_trim(skb, off);
1724                                 err = -EFAULT;
1725                                 goto error;
1726                         }
1727                 } else if (!uarg || !uarg->zerocopy) {
1728                         int i = skb_shinfo(skb)->nr_frags;
1729
1730                         err = -ENOMEM;
1731                         if (!sk_page_frag_refill(sk, pfrag))
1732                                 goto error;
1733
1734                         if (!skb_can_coalesce(skb, i, pfrag->page,
1735                                               pfrag->offset)) {
1736                                 err = -EMSGSIZE;
1737                                 if (i == MAX_SKB_FRAGS)
1738                                         goto error;
1739
1740                                 __skb_fill_page_desc(skb, i, pfrag->page,
1741                                                      pfrag->offset, 0);
1742                                 skb_shinfo(skb)->nr_frags = ++i;
1743                                 get_page(pfrag->page);
1744                         }
1745                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1746                         if (getfrag(from,
1747                                     page_address(pfrag->page) + pfrag->offset,
1748                                     offset, copy, skb->len, skb) < 0)
1749                                 goto error_efault;
1750
1751                         pfrag->offset += copy;
1752                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1753                         skb->len += copy;
1754                         skb->data_len += copy;
1755                         skb->truesize += copy;
1756                         wmem_alloc_delta += copy;
1757                 } else {
1758                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1759                         if (err < 0)
1760                                 goto error;
1761                 }
1762                 offset += copy;
1763                 length -= copy;
1764         }
1765
1766         if (wmem_alloc_delta)
1767                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1768         return 0;
1769
1770 error_efault:
1771         err = -EFAULT;
1772 error:
1773         net_zcopy_put_abort(uarg, extra_uref);
1774         cork->length -= length;
1775         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1776         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1777         return err;
1778 }
1779
1780 int ip6_append_data(struct sock *sk,
1781                     int getfrag(void *from, char *to, int offset, int len,
1782                                 int odd, struct sk_buff *skb),
1783                     void *from, int length, int transhdrlen,
1784                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1785                     struct rt6_info *rt, unsigned int flags)
1786 {
1787         struct inet_sock *inet = inet_sk(sk);
1788         struct ipv6_pinfo *np = inet6_sk(sk);
1789         int exthdrlen;
1790         int err;
1791
1792         if (flags&MSG_PROBE)
1793                 return 0;
1794         if (skb_queue_empty(&sk->sk_write_queue)) {
1795                 /*
1796                  * setup for corking
1797                  */
1798                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1799                                      ipc6, rt, fl6);
1800                 if (err)
1801                         return err;
1802
1803                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1804                 length += exthdrlen;
1805                 transhdrlen += exthdrlen;
1806         } else {
1807                 fl6 = &inet->cork.fl.u.ip6;
1808                 transhdrlen = 0;
1809         }
1810
1811         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1812                                  &np->cork, sk_page_frag(sk), getfrag,
1813                                  from, length, transhdrlen, flags, ipc6);
1814 }
1815 EXPORT_SYMBOL_GPL(ip6_append_data);
1816
1817 static void ip6_cork_release(struct inet_cork_full *cork,
1818                              struct inet6_cork *v6_cork)
1819 {
1820         if (v6_cork->opt) {
1821                 kfree(v6_cork->opt->dst0opt);
1822                 kfree(v6_cork->opt->dst1opt);
1823                 kfree(v6_cork->opt->hopopt);
1824                 kfree(v6_cork->opt->srcrt);
1825                 kfree(v6_cork->opt);
1826                 v6_cork->opt = NULL;
1827         }
1828
1829         if (cork->base.dst) {
1830                 dst_release(cork->base.dst);
1831                 cork->base.dst = NULL;
1832                 cork->base.flags &= ~IPCORK_ALLFRAG;
1833         }
1834         memset(&cork->fl, 0, sizeof(cork->fl));
1835 }
1836
1837 struct sk_buff *__ip6_make_skb(struct sock *sk,
1838                                struct sk_buff_head *queue,
1839                                struct inet_cork_full *cork,
1840                                struct inet6_cork *v6_cork)
1841 {
1842         struct sk_buff *skb, *tmp_skb;
1843         struct sk_buff **tail_skb;
1844         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1845         struct ipv6_pinfo *np = inet6_sk(sk);
1846         struct net *net = sock_net(sk);
1847         struct ipv6hdr *hdr;
1848         struct ipv6_txoptions *opt = v6_cork->opt;
1849         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1850         struct flowi6 *fl6 = &cork->fl.u.ip6;
1851         unsigned char proto = fl6->flowi6_proto;
1852
1853         skb = __skb_dequeue(queue);
1854         if (!skb)
1855                 goto out;
1856         tail_skb = &(skb_shinfo(skb)->frag_list);
1857
1858         /* move skb->data to ip header from ext header */
1859         if (skb->data < skb_network_header(skb))
1860                 __skb_pull(skb, skb_network_offset(skb));
1861         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1862                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1863                 *tail_skb = tmp_skb;
1864                 tail_skb = &(tmp_skb->next);
1865                 skb->len += tmp_skb->len;
1866                 skb->data_len += tmp_skb->len;
1867                 skb->truesize += tmp_skb->truesize;
1868                 tmp_skb->destructor = NULL;
1869                 tmp_skb->sk = NULL;
1870         }
1871
1872         /* Allow local fragmentation. */
1873         skb->ignore_df = ip6_sk_ignore_df(sk);
1874
1875         *final_dst = fl6->daddr;
1876         __skb_pull(skb, skb_network_header_len(skb));
1877         if (opt && opt->opt_flen)
1878                 ipv6_push_frag_opts(skb, opt, &proto);
1879         if (opt && opt->opt_nflen)
1880                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1881
1882         skb_push(skb, sizeof(struct ipv6hdr));
1883         skb_reset_network_header(skb);
1884         hdr = ipv6_hdr(skb);
1885
1886         ip6_flow_hdr(hdr, v6_cork->tclass,
1887                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1888                                         ip6_autoflowlabel(net, np), fl6));
1889         hdr->hop_limit = v6_cork->hop_limit;
1890         hdr->nexthdr = proto;
1891         hdr->saddr = fl6->saddr;
1892         hdr->daddr = *final_dst;
1893
1894         skb->priority = sk->sk_priority;
1895         skb->mark = cork->base.mark;
1896
1897         skb->tstamp = cork->base.transmit_time;
1898
1899         skb_dst_set(skb, dst_clone(&rt->dst));
1900         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1901         if (proto == IPPROTO_ICMPV6) {
1902                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1903
1904                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1905                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1906         }
1907
1908         ip6_cork_release(cork, v6_cork);
1909 out:
1910         return skb;
1911 }
1912
1913 int ip6_send_skb(struct sk_buff *skb)
1914 {
1915         struct net *net = sock_net(skb->sk);
1916         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1917         int err;
1918
1919         err = ip6_local_out(net, skb->sk, skb);
1920         if (err) {
1921                 if (err > 0)
1922                         err = net_xmit_errno(err);
1923                 if (err)
1924                         IP6_INC_STATS(net, rt->rt6i_idev,
1925                                       IPSTATS_MIB_OUTDISCARDS);
1926         }
1927
1928         return err;
1929 }
1930
/*
 * Build a packet from the socket's pending data via ip6_finish_skb() and
 * send it with ip6_send_skb().
 *
 * Returns 0 when ip6_finish_skb() yields no skb, otherwise the result of
 * ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb = ip6_finish_skb(sk);

        return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1942
1943 static void __ip6_flush_pending_frames(struct sock *sk,
1944                                        struct sk_buff_head *queue,
1945                                        struct inet_cork_full *cork,
1946                                        struct inet6_cork *v6_cork)
1947 {
1948         struct sk_buff *skb;
1949
1950         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1951                 if (skb_dst(skb))
1952                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1953                                       IPSTATS_MIB_OUTDISCARDS);
1954                 kfree_skb(skb);
1955         }
1956
1957         ip6_cork_release(cork, v6_cork);
1958 }
1959
1960 void ip6_flush_pending_frames(struct sock *sk)
1961 {
1962         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1963                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1964 }
1965 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1966
1967 struct sk_buff *ip6_make_skb(struct sock *sk,
1968                              int getfrag(void *from, char *to, int offset,
1969                                          int len, int odd, struct sk_buff *skb),
1970                              void *from, int length, int transhdrlen,
1971                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1972                              struct rt6_info *rt, unsigned int flags,
1973                              struct inet_cork_full *cork)
1974 {
1975         struct inet6_cork v6_cork;
1976         struct sk_buff_head queue;
1977         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1978         int err;
1979
1980         if (flags & MSG_PROBE)
1981                 return NULL;
1982
1983         __skb_queue_head_init(&queue);
1984
1985         cork->base.flags = 0;
1986         cork->base.addr = 0;
1987         cork->base.opt = NULL;
1988         cork->base.dst = NULL;
1989         v6_cork.opt = NULL;
1990         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1991         if (err) {
1992                 ip6_cork_release(cork, &v6_cork);
1993                 return ERR_PTR(err);
1994         }
1995         if (ipc6->dontfrag < 0)
1996                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1997
1998         err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1999                                 &current->task_frag, getfrag, from,
2000                                 length + exthdrlen, transhdrlen + exthdrlen,
2001                                 flags, ipc6);
2002         if (err) {
2003                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2004                 return ERR_PTR(err);
2005         }
2006
2007         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2008 }