net/ipv6/ip6_output.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *      IPv6 output functions
   4  *      Linux INET6 implementation
   5  *
   6  *      Authors:
   7  *      Pedro Roque             <roque@di.fc.ul.pt>
   8  *
   9  *      Based on linux/net/ipv4/ip_output.c
  10  *
  11  *      Changes:
   12  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  13  *                              extension headers are implemented.
  14  *                              route changes now work.
  15  *                              ip6_forward does not confuse sniffers.
  16  *                              etc.
  17  *
  18  *      H. von Brand    :       Added missing #include <linux/string.h>
  19  *      Imran Patel     :       frag id should be in NBO
  20  *      Kazunori MIYAZAWA @USAGI
  21  *                      :       add ip6_append_data and related functions
  22  *                              for datagram xmit
  23  */
  24
  25 #include <linux/errno.h>
  26 #include <linux/kernel.h>
  27 #include <linux/string.h>
  28 #include <linux/socket.h>
  29 #include <linux/net.h>
  30 #include <linux/netdevice.h>
  31 #include <linux/if_arp.h>
  32 #include <linux/in6.h>
  33 #include <linux/tcp.h>
  34 #include <linux/route.h>
  35 #include <linux/module.h>
  36 #include <linux/slab.h>
  37
  38 #include <linux/bpf-cgroup.h>
  39 #include <linux/netfilter.h>
  40 #include <linux/netfilter_ipv6.h>
  41
  42 #include <net/sock.h>
  43 #include <net/snmp.h>
  44
  45 #include <net/ipv6.h>
  46 #include <net/ndisc.h>
  47 #include <net/protocol.h>
  48 #include <net/ip6_route.h>
  49 #include <net/addrconf.h>
  50 #include <net/rawv6.h>
  51 #include <net/icmp.h>
  52 #include <net/xfrm.h>
  53 #include <net/checksum.h>
  54 #include <linux/mroute6.h>
  55 #include <net/l3mdev.h>
  56 #include <net/lwtunnel.h>
  57 #include <net/ip_tunnels.h>
  58
  59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  60 {
  61         struct dst_entry *dst = skb_dst(skb);
  62         struct net_device *dev = dst->dev;
  63         struct inet6_dev *idev = ip6_dst_idev(dst);
  64         unsigned int hh_len = LL_RESERVED_SPACE(dev);
  65         const struct in6_addr *daddr, *nexthop;
  66         struct ipv6hdr *hdr;
  67         struct neighbour *neigh;
  68         int ret;
  69
  70         /* Be paranoid, rather than too clever. */
  71         if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
  72                 skb = skb_expand_head(skb, hh_len);
  73                 if (!skb) {
  74                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
  75                         return -ENOMEM;
  76                 }
  77         }
  78
  79         hdr = ipv6_hdr(skb);
  80         daddr = &hdr->daddr;
  81         if (ipv6_addr_is_multicast(daddr)) {
  82                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  83                     ((mroute6_is_socket(net, skb) &&
  84                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  85                      ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
  86                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  87
  88                         /* Do not check for IFF_ALLMULTI; multicast routing
  89                            is not supported in any case.
  90                          */
  91                         if (newskb)
  92                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  93                                         net, sk, newskb, NULL, newskb->dev,
  94                                         dev_loopback_xmit);
  95
  96                         if (hdr->hop_limit == 0) {
  97                                 IP6_INC_STATS(net, idev,
  98                                               IPSTATS_MIB_OUTDISCARDS);
  99                                 kfree_skb(skb);
 100                                 return 0;
 101                         }
 102                 }
 103
 104                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
 105                 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
 106                     !(dev->flags & IFF_LOOPBACK)) {
 107                         kfree_skb(skb);
 108                         return 0;
 109                 }
 110         }
 111
 112         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 113                 int res = lwtunnel_xmit(skb);
 114
 115                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 116                         return res;
 117         }
 118
 119         rcu_read_lock_bh();
 120         nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
 121         neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
 122         if (unlikely(!neigh))
 123                 neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
 124         if (!IS_ERR(neigh)) {
 125                 sock_confirm_neigh(skb, neigh);
 126                 ret = neigh_output(neigh, skb, false);
 127                 rcu_read_unlock_bh();
 128                 return ret;
 129         }
 130         rcu_read_unlock_bh();
 131
 132         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
 133         kfree_skb(skb);
 134         return -EINVAL;
 135 }
 136
 137 static int
 138 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
 139                                     struct sk_buff *skb, unsigned int mtu)
 140 {
 141         struct sk_buff *segs, *nskb;
 142         netdev_features_t features;
 143         int ret = 0;
 144
 145         /* Please see corresponding comment in ip_finish_output_gso
 146          * describing the cases where GSO segment length exceeds the
 147          * egress MTU.
 148          */
 149         features = netif_skb_features(skb);
 150         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 151         if (IS_ERR_OR_NULL(segs)) {
 152                 kfree_skb(skb);
 153                 return -ENOMEM;
 154         }
 155
 156         consume_skb(skb);
 157
 158         skb_list_walk_safe(segs, segs, nskb) {
 159                 int err;
 160
 161                 skb_mark_not_on_list(segs);
 162                 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
 163                 if (err && ret == 0)
 164                         ret = err;
 165         }
 166
 167         return ret;
 168 }
 169
 170 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 171 {
 172         unsigned int mtu;
 173
 174 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 175         /* Policy lookup after SNAT yielded a new policy */
 176         if (skb_dst(skb)->xfrm) {
 177                 IP6CB(skb)->flags |= IP6SKB_REROUTED;
 178                 return dst_output(net, sk, skb);
 179         }
 180 #endif
 181
 182         mtu = ip6_skb_dst_mtu(skb);
 183         if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
 184                 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
 185
 186         if ((skb->len > mtu && !skb_is_gso(skb)) ||
 187             dst_allfrag(skb_dst(skb)) ||
 188             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 189                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 190         else
 191                 return ip6_finish_output2(net, sk, skb);
 192 }
 193
 194 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 195 {
 196         int ret;
 197
 198         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 199         switch (ret) {
 200         case NET_XMIT_SUCCESS:
 201                 return __ip6_finish_output(net, sk, skb);
 202         case NET_XMIT_CN:
 203                 return __ip6_finish_output(net, sk, skb) ? : ret;
 204         default:
 205                 kfree_skb(skb);
 206                 return ret;
 207         }
 208 }
 209
 210 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 211 {
 212         struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
 213         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 214
 215         skb->protocol = htons(ETH_P_IPV6);
 216         skb->dev = dev;
 217
 218         if (unlikely(idev->cnf.disable_ipv6)) {
 219                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 220                 kfree_skb(skb);
 221                 return 0;
 222         }
 223
 224         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 225                             net, sk, skb, indev, dev,
 226                             ip6_finish_output,
 227                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 228 }
 229 EXPORT_SYMBOL(ip6_output);
 230
 231 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 232 {
 233         if (!np->autoflowlabel_set)
 234                 return ip6_default_np_autolabel(net);
 235         else
 236                 return np->autoflowlabel;
 237 }
 238
 239 /*
 240  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 241  * Note : socket lock is not held for SYNACK packets, but might be modified
 242  * by calls to skb_set_owner_w() and ipv6_local_error(),
 243  * which are using proper atomic operations or spinlocks.
 244  */
 245 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 246              __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
 247 {
 248         struct net *net = sock_net(sk);
 249         const struct ipv6_pinfo *np = inet6_sk(sk);
 250         struct in6_addr *first_hop = &fl6->daddr;
 251         struct dst_entry *dst = skb_dst(skb);
 252         struct net_device *dev = dst->dev;
 253         struct inet6_dev *idev = ip6_dst_idev(dst);
 254         unsigned int head_room;
 255         struct ipv6hdr *hdr;
 256         u8  proto = fl6->flowi6_proto;
 257         int seg_len = skb->len;
 258         int hlimit = -1;
 259         u32 mtu;
 260
 261         head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
 262         if (opt)
 263                 head_room += opt->opt_nflen + opt->opt_flen;
 264
 265         if (unlikely(head_room > skb_headroom(skb))) {
 266                 skb = skb_expand_head(skb, head_room);
 267                 if (!skb) {
 268                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 269                         return -ENOBUFS;
 270                 }
 271         }
 272
 273         if (opt) {
 274                 seg_len += opt->opt_nflen + opt->opt_flen;
 275
 276                 if (opt->opt_flen)
 277                         ipv6_push_frag_opts(skb, opt, &proto);
 278
 279                 if (opt->opt_nflen)
 280                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 281                                              &fl6->saddr);
 282         }
 283
 284         skb_push(skb, sizeof(struct ipv6hdr));
 285         skb_reset_network_header(skb);
 286         hdr = ipv6_hdr(skb);
 287
 288         /*
 289          *      Fill in the IPv6 header
 290          */
 291         if (np)
 292                 hlimit = np->hop_limit;
 293         if (hlimit < 0)
 294                 hlimit = ip6_dst_hoplimit(dst);
 295
 296         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 297                                 ip6_autoflowlabel(net, np), fl6));
 298
 299         hdr->payload_len = htons(seg_len);
 300         hdr->nexthdr = proto;
 301         hdr->hop_limit = hlimit;
 302
 303         hdr->saddr = fl6->saddr;
 304         hdr->daddr = *first_hop;
 305
 306         skb->protocol = htons(ETH_P_IPV6);
 307         skb->priority = priority;
 308         skb->mark = mark;
 309
 310         mtu = dst_mtu(dst);
 311         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 312                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
 313
 314                 /* if egress device is enslaved to an L3 master device pass the
 315                  * skb to its handler for processing
 316                  */
 317                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 318                 if (unlikely(!skb))
 319                         return 0;
 320
 321                 /* hooks should never assume socket lock is held.
 322                  * we promote our socket to non const
 323                  */
 324                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 325                                net, (struct sock *)sk, skb, NULL, dev,
 326                                dst_output);
 327         }
 328
 329         skb->dev = dev;
 330         /* ipv6_local_error() does not require socket lock,
 331          * we promote our socket to non const
 332          */
 333         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 334
 335         IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 336         kfree_skb(skb);
 337         return -EMSGSIZE;
 338 }
 339 EXPORT_SYMBOL(ip6_xmit);
 340
 341 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 342 {
 343         struct ip6_ra_chain *ra;
 344         struct sock *last = NULL;
 345
 346         read_lock(&ip6_ra_lock);
 347         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 348                 struct sock *sk = ra->sk;
 349                 if (sk && ra->sel == sel &&
 350                     (!sk->sk_bound_dev_if ||
 351                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 352                         struct ipv6_pinfo *np = inet6_sk(sk);
 353
 354                         if (np && np->rtalert_isolate &&
 355                             !net_eq(sock_net(sk), dev_net(skb->dev))) {
 356                                 continue;
 357                         }
 358                         if (last) {
 359                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 360                                 if (skb2)
 361                                         rawv6_rcv(last, skb2);
 362                         }
 363                         last = sk;
 364                 }
 365         }
 366
 367         if (last) {
 368                 rawv6_rcv(last, skb);
 369                 read_unlock(&ip6_ra_lock);
 370                 return 1;
 371         }
 372         read_unlock(&ip6_ra_lock);
 373         return 0;
 374 }
 375
 376 static int ip6_forward_proxy_check(struct sk_buff *skb)
 377 {
 378         struct ipv6hdr *hdr = ipv6_hdr(skb);
 379         u8 nexthdr = hdr->nexthdr;
 380         __be16 frag_off;
 381         int offset;
 382
 383         if (ipv6_ext_hdr(nexthdr)) {
 384                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 385                 if (offset < 0)
 386                         return 0;
 387         } else
 388                 offset = sizeof(struct ipv6hdr);
 389
 390         if (nexthdr == IPPROTO_ICMPV6) {
 391                 struct icmp6hdr *icmp6;
 392
 393                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 394                                          offset + 1 - skb->data)))
 395                         return 0;
 396
 397                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 398
 399                 switch (icmp6->icmp6_type) {
 400                 case NDISC_ROUTER_SOLICITATION:
 401                 case NDISC_ROUTER_ADVERTISEMENT:
 402                 case NDISC_NEIGHBOUR_SOLICITATION:
 403                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 404                 case NDISC_REDIRECT:
 405                         /* For reaction involving unicast neighbor discovery
 406                          * message destined to the proxied address, pass it to
 407                          * input function.
 408                          */
 409                         return 1;
 410                 default:
 411                         break;
 412                 }
 413         }
 414
 415         /*
 416          * The proxying router can't forward traffic sent to a link-local
 417          * address, so signal the sender and discard the packet. This
 418          * behavior is clarified by the MIPv6 specification.
 419          */
 420         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 421                 dst_link_failure(skb);
 422                 return -1;
 423         }
 424
 425         return 0;
 426 }
 427
 428 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 429                                      struct sk_buff *skb)
 430 {
 431         struct dst_entry *dst = skb_dst(skb);
 432
 433         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 434         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 435
 436 #ifdef CONFIG_NET_SWITCHDEV
 437         if (skb->offload_l3_fwd_mark) {
 438                 consume_skb(skb);
 439                 return 0;
 440         }
 441 #endif
 442
 443         skb->tstamp = 0;
 444         return dst_output(net, sk, skb);
 445 }
 446
 447 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 448 {
 449         if (skb->len <= mtu)
 450                 return false;
 451
 452         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 453         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 454                 return true;
 455
 456         if (skb->ignore_df)
 457                 return false;
 458
 459         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 460                 return false;
 461
 462         return true;
 463 }
 464
 465 int ip6_forward(struct sk_buff *skb)
 466 {
 467         struct dst_entry *dst = skb_dst(skb);
 468         struct ipv6hdr *hdr = ipv6_hdr(skb);
 469         struct inet6_skb_parm *opt = IP6CB(skb);
 470         struct net *net = dev_net(dst->dev);
 471         struct inet6_dev *idev;
 472         u32 mtu;
 473
 474         idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
 475         if (net->ipv6.devconf_all->forwarding == 0)
 476                 goto error;
 477
 478         if (skb->pkt_type != PACKET_HOST)
 479                 goto drop;
 480
 481         if (unlikely(skb->sk))
 482                 goto drop;
 483
 484         if (skb_warn_if_lro(skb))
 485                 goto drop;
 486
 487         if (!net->ipv6.devconf_all->disable_policy &&
 488             (!idev || !idev->cnf.disable_policy) &&
 489             !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 490                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 491                 goto drop;
 492         }
 493
 494         skb_forward_csum(skb);
 495
 496         /*
 497          *      We DO NOT make any processing on
 498          *      RA packets, pushing them to user level AS IS
 499          *      without ane WARRANTY that application will be able
 500          *      to interpret them. The reason is that we
 501          *      cannot make anything clever here.
 502          *
 503          *      We are not end-node, so that if packet contains
 504          *      AH/ESP, we cannot make anything.
 505          *      Defragmentation also would be mistake, RA packets
 506          *      cannot be fragmented, because there is no warranty
 507          *      that different fragments will go along one path. --ANK
 508          */
 509         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 510                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 511                         return 0;
 512         }
 513
 514         /*
 515          *      check and decrement ttl
 516          */
 517         if (hdr->hop_limit <= 1) {
 518                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 519                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 520
 521                 kfree_skb(skb);
 522                 return -ETIMEDOUT;
 523         }
 524
 525         /* XXX: idev->cnf.proxy_ndp? */
 526         if (net->ipv6.devconf_all->proxy_ndp &&
 527             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 528                 int proxied = ip6_forward_proxy_check(skb);
 529                 if (proxied > 0) {
 530                         hdr->hop_limit--;
 531                         return ip6_input(skb);
 532                 } else if (proxied < 0) {
 533                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 534                         goto drop;
 535                 }
 536         }
 537
 538         if (!xfrm6_route_forward(skb)) {
 539                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 540                 goto drop;
 541         }
 542         dst = skb_dst(skb);
 543
 544         /* IPv6 specs say nothing about it, but it is clear that we cannot
 545            send redirects to source routed frames.
 546            We don't send redirects to frames decapsulated from IPsec.
 547          */
 548         if (IP6CB(skb)->iif == dst->dev->ifindex &&
 549             opt->srcrt == 0 && !skb_sec_path(skb)) {
 550                 struct in6_addr *target = NULL;
 551                 struct inet_peer *peer;
 552                 struct rt6_info *rt;
 553
 554                 /*
 555                  *      incoming and outgoing devices are the same
 556                  *      send a redirect.
 557                  */
 558
 559                 rt = (struct rt6_info *) dst;
 560                 if (rt->rt6i_flags & RTF_GATEWAY)
 561                         target = &rt->rt6i_gateway;
 562                 else
 563                         target = &hdr->daddr;
 564
 565                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 566
 567                 /* Limit redirects both by destination (here)
 568                    and by source (inside ndisc_send_redirect)
 569                  */
 570                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 571                         ndisc_send_redirect(skb, target);
 572                 if (peer)
 573                         inet_putpeer(peer);
 574         } else {
 575                 int addrtype = ipv6_addr_type(&hdr->saddr);
 576
 577                 /* This check is security critical. */
 578                 if (addrtype == IPV6_ADDR_ANY ||
 579                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 580                         goto error;
 581                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 582                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 583                                     ICMPV6_NOT_NEIGHBOUR, 0);
 584                         goto error;
 585                 }
 586         }
 587
 588         mtu = ip6_dst_mtu_maybe_forward(dst, true);
 589         if (mtu < IPV6_MIN_MTU)
 590                 mtu = IPV6_MIN_MTU;
 591
 592         if (ip6_pkt_too_big(skb, mtu)) {
 593                 /* Again, force OUTPUT device used as source address */
 594                 skb->dev = dst->dev;
 595                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 596                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
 597                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 598                                 IPSTATS_MIB_FRAGFAILS);
 599                 kfree_skb(skb);
 600                 return -EMSGSIZE;
 601         }
 602
 603         if (skb_cow(skb, dst->dev->hard_header_len)) {
 604                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 605                                 IPSTATS_MIB_OUTDISCARDS);
 606                 goto drop;
 607         }
 608
 609         hdr = ipv6_hdr(skb);
 610
 611         /* Mangling hops number delayed to point after skb COW */
 612
 613         hdr->hop_limit--;
 614
 615         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 616                        net, NULL, skb, skb->dev, dst->dev,
 617                        ip6_forward_finish);
 618
 619 error:
 620         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 621 drop:
 622         kfree_skb(skb);
 623         return -EINVAL;
 624 }
 625
 626 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 627 {
 628         to->pkt_type = from->pkt_type;
 629         to->priority = from->priority;
 630         to->protocol = from->protocol;
 631         skb_dst_drop(to);
 632         skb_dst_set(to, dst_clone(skb_dst(from)));
 633         to->dev = from->dev;
 634         to->mark = from->mark;
 635
 636         skb_copy_hash(to, from);
 637
 638 #ifdef CONFIG_NET_SCHED
 639         to->tc_index = from->tc_index;
 640 #endif
 641         nf_copy(to, from);
 642         skb_ext_copy(to, from);
 643         skb_copy_secmark(to, from);
 644 }
 645
 646 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
 647                       u8 nexthdr, __be32 frag_id,
 648                       struct ip6_fraglist_iter *iter)
 649 {
 650         unsigned int first_len;
 651         struct frag_hdr *fh;
 652
 653         /* BUILD HEADER */
 654         *prevhdr = NEXTHDR_FRAGMENT;
 655         iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 656         if (!iter->tmp_hdr)
 657                 return -ENOMEM;
 658
 659         iter->frag = skb_shinfo(skb)->frag_list;
 660         skb_frag_list_init(skb);
 661
 662         iter->offset = 0;
 663         iter->hlen = hlen;
 664         iter->frag_id = frag_id;
 665         iter->nexthdr = nexthdr;
 666
 667         __skb_pull(skb, hlen);
 668         fh = __skb_push(skb, sizeof(struct frag_hdr));
 669         __skb_push(skb, hlen);
 670         skb_reset_network_header(skb);
 671         memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
 672
 673         fh->nexthdr = nexthdr;
 674         fh->reserved = 0;
 675         fh->frag_off = htons(IP6_MF);
 676         fh->identification = frag_id;
 677
 678         first_len = skb_pagelen(skb);
 679         skb->data_len = first_len - skb_headlen(skb);
 680         skb->len = first_len;
 681         ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 682
 683         return 0;
 684 }
 685 EXPORT_SYMBOL(ip6_fraglist_init);
 686
 687 void ip6_fraglist_prepare(struct sk_buff *skb,
 688                           struct ip6_fraglist_iter *iter)
 689 {
 690         struct sk_buff *frag = iter->frag;
 691         unsigned int hlen = iter->hlen;
 692         struct frag_hdr *fh;
 693
 694         frag->ip_summed = CHECKSUM_NONE;
 695         skb_reset_transport_header(frag);
 696         fh = __skb_push(frag, sizeof(struct frag_hdr));
 697         __skb_push(frag, hlen);
 698         skb_reset_network_header(frag);
 699         memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
 700         iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
 701         fh->nexthdr = iter->nexthdr;
 702         fh->reserved = 0;
 703         fh->frag_off = htons(iter->offset);
 704         if (frag->next)
 705                 fh->frag_off |= htons(IP6_MF);
 706         fh->identification = iter->frag_id;
 707         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 708         ip6_copy_metadata(frag, skb);
 709 }
 710 EXPORT_SYMBOL(ip6_fraglist_prepare);
 711
 712 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
 713                    unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
 714                    u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
 715 {
 716         state->prevhdr = prevhdr;
 717         state->nexthdr = nexthdr;
 718         state->frag_id = frag_id;
 719
 720         state->hlen = hlen;
 721         state->mtu = mtu;
 722
 723         state->left = skb->len - hlen;  /* Space per frame */
 724         state->ptr = hlen;              /* Where to start from */
 725
 726         state->hroom = hdr_room;
 727         state->troom = needed_tailroom;
 728
 729         state->offset = 0;
 730 }
 731 EXPORT_SYMBOL(ip6_frag_init);
 732
 733 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
 734 {
 735         u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
 736         struct sk_buff *frag;
 737         struct frag_hdr *fh;
 738         unsigned int len;
 739
 740         len = state->left;
 741         /* IF: it doesn't fit, use 'mtu' - the data space left */
 742         if (len > state->mtu)
 743                 len = state->mtu;
 744         /* IF: we are not sending up to and including the packet end
 745            then align the next start on an eight byte boundary */
 746         if (len < state->left)
 747                 len &= ~7;
 748
 749         /* Allocate buffer */
 750         frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
 751                          state->hroom + state->troom, GFP_ATOMIC);
 752         if (!frag)
 753                 return ERR_PTR(-ENOMEM);
 754
 755         /*
 756          *      Set up data on packet
 757          */
 758
 759         ip6_copy_metadata(frag, skb);
 760         skb_reserve(frag, state->hroom);
 761         skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
 762         skb_reset_network_header(frag);
 763         fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
 764         frag->transport_header = (frag->network_header + state->hlen +
 765                                   sizeof(struct frag_hdr));
 766
 767         /*
 768          *      Charge the memory for the fragment to any owner
 769          *      it might possess
 770          */
 771         if (skb->sk)
 772                 skb_set_owner_w(frag, skb->sk);
 773
 774         /*
 775          *      Copy the packet header into the new buffer.
 776          */
 777         skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
 778
 779         fragnexthdr_offset = skb_network_header(frag);
 780         fragnexthdr_offset += prevhdr - skb_network_header(skb);
 781         *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 782
 783         /*
 784          *      Build fragment header.
 785          */
 786         fh->nexthdr = state->nexthdr;
 787         fh->reserved = 0;
 788         fh->identification = state->frag_id;
 789
 790         /*
 791          *      Copy a block of the IP datagram.
 792          */
 793         BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
 794                              len));
 795         state->left -= len;
 796
 797         fh->frag_off = htons(state->offset);
 798         if (state->left > 0)
 799                 fh->frag_off |= htons(IP6_MF);
 800         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 801
 802         state->ptr += len;
 803         state->offset += len;
 804
 805         return frag;
 806 }
 807 EXPORT_SYMBOL(ip6_frag_next);
 808
 809 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 810                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 811 {
 812         struct sk_buff *frag;
 813         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 814         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 815                                 inet6_sk(skb->sk) : NULL;
 816         struct ip6_frag_state state;
 817         unsigned int mtu, hlen, nexthdr_offset;
 818         ktime_t tstamp = skb->tstamp;
 819         int hroom, err = 0;
 820         __be32 frag_id;
 821         u8 *prevhdr, nexthdr = 0;
 822
 823         err = ip6_find_1stfragopt(skb, &prevhdr);
 824         if (err < 0)
 825                 goto fail;
 826         hlen = err;
 827         nexthdr = *prevhdr;
 828         nexthdr_offset = prevhdr - skb_network_header(skb);
 829
 830         mtu = ip6_skb_dst_mtu(skb);
 831
 832         /* We must not fragment if the socket is set to force MTU discovery
  833          * or if the skb is not generated by a local socket.
 834          */
 835         if (unlikely(!skb->ignore_df && skb->len > mtu))
 836                 goto fail_toobig;
 837
 838         if (IP6CB(skb)->frag_max_size) {
 839                 if (IP6CB(skb)->frag_max_size > mtu)
 840                         goto fail_toobig;
 841
 842                 /* don't send fragments larger than what we received */
 843                 mtu = IP6CB(skb)->frag_max_size;
 844                 if (mtu < IPV6_MIN_MTU)
 845                         mtu = IPV6_MIN_MTU;
 846         }
 847
 848         if (np && np->frag_size < mtu) {
 849                 if (np->frag_size)
 850                         mtu = np->frag_size;
 851         }
 852         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 853                 goto fail_toobig;
 854         mtu -= hlen + sizeof(struct frag_hdr);
 855
 856         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 857                                     &ipv6_hdr(skb)->saddr);
 858
 859         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 860             (err = skb_checksum_help(skb)))
 861                 goto fail;
 862
 863         prevhdr = skb_network_header(skb) + nexthdr_offset;
 864         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 865         if (skb_has_frag_list(skb)) {
 866                 unsigned int first_len = skb_pagelen(skb);
 867                 struct ip6_fraglist_iter iter;
 868                 struct sk_buff *frag2;
 869
 870                 if (first_len - hlen > mtu ||
 871                     ((first_len - hlen) & 7) ||
 872                     skb_cloned(skb) ||
 873                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 874                         goto slow_path;
 875
 876                 skb_walk_frags(skb, frag) {
 877                         /* Correct geometry. */
 878                         if (frag->len > mtu ||
 879                             ((frag->len & 7) && frag->next) ||
 880                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 881                                 goto slow_path_clean;
 882
 883                         /* Partially cloned skb? */
 884                         if (skb_shared(frag))
 885                                 goto slow_path_clean;
 886
 887                         BUG_ON(frag->sk);
 888                         if (skb->sk) {
 889                                 frag->sk = skb->sk;
 890                                 frag->destructor = sock_wfree;
 891                         }
 892                         skb->truesize -= frag->truesize;
 893                 }
 894
 895                 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
 896                                         &iter);
 897                 if (err < 0)
 898                         goto fail;
 899
 900                 for (;;) {
 901                         /* Prepare header of the next frame,
 902                          * before previous one went down. */
 903                         if (iter.frag)
 904                                 ip6_fraglist_prepare(skb, &iter);
 905
 906                         skb->tstamp = tstamp;
 907                         err = output(net, sk, skb);
 908                         if (!err)
 909                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 910                                               IPSTATS_MIB_FRAGCREATES);
 911
 912                         if (err || !iter.frag)
 913                                 break;
 914
 915                         skb = ip6_fraglist_next(&iter);
 916                 }
 917
 918                 kfree(iter.tmp_hdr);
 919
 920                 if (err == 0) {
 921                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 922                                       IPSTATS_MIB_FRAGOKS);
 923                         return 0;
 924                 }
 925
 926                 kfree_skb_list(iter.frag);
 927
 928                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 929                               IPSTATS_MIB_FRAGFAILS);
 930                 return err;
 931
 932 slow_path_clean:
 933                 skb_walk_frags(skb, frag2) {
 934                         if (frag2 == frag)
 935                                 break;
 936                         frag2->sk = NULL;
 937                         frag2->destructor = NULL;
 938                         skb->truesize += frag2->truesize;
 939                 }
 940         }
 941
 942 slow_path:
 943         /*
 944          *      Fragment the datagram.
 945          */
 946
 947         ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
 948                       LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
 949                       &state);
 950
 951         /*
 952          *      Keep copying data until we run out.
 953          */
 954
 955         while (state.left > 0) {
 956                 frag = ip6_frag_next(skb, &state);
 957                 if (IS_ERR(frag)) {
 958                         err = PTR_ERR(frag);
 959                         goto fail;
 960                 }
 961
 962                 /*
 963                  *      Put this fragment into the sending queue.
 964                  */
 965                 frag->tstamp = tstamp;
 966                 err = output(net, sk, frag);
 967                 if (err)
 968                         goto fail;
 969
 970                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 971                               IPSTATS_MIB_FRAGCREATES);
 972         }
 973         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 974                       IPSTATS_MIB_FRAGOKS);
 975         consume_skb(skb);
 976         return err;
 977
 978 fail_toobig:
 979         if (skb->sk && dst_allfrag(skb_dst(skb)))
 980                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 981
 982         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 983         err = -EMSGSIZE;
 984
 985 fail:
 986         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 987                       IPSTATS_MIB_FRAGFAILS);
 988         kfree_skb(skb);
 989         return err;
 990 }
 991
 992 static inline int ip6_rt_check(const struct rt6key *rt_key,
 993                                const struct in6_addr *fl_addr,
 994                                const struct in6_addr *addr_cache)
 995 {
 996         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 997                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
 998 }
 999
1000 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1001                                           struct dst_entry *dst,
1002                                           const struct flowi6 *fl6)
1003 {
1004         struct ipv6_pinfo *np = inet6_sk(sk);
1005         struct rt6_info *rt;
1006
1007         if (!dst)
1008                 goto out;
1009
1010         if (dst->ops->family != AF_INET6) {
1011                 dst_release(dst);
1012                 return NULL;
1013         }
1014
1015         rt = (struct rt6_info *)dst;
1016         /* Yes, checking route validity in not connected
1017          * case is not very simple. Take into account,
1018          * that we do not support routing by source, TOS,
1019          * and MSG_DONTROUTE            --ANK (980726)
1020          *
1021          * 1. ip6_rt_check(): If route was host route,
1022          *    check that cached destination is current.
1023          *    If it is network route, we still may
1024          *    check its validity using saved pointer
1025          *    to the last used address: daddr_cache.
1026          *    We do not want to save whole address now,
1027          *    (because main consumer of this service
1028          *    is tcp, which has not this problem),
1029          *    so that the last trick works only on connected
1030          *    sockets.
1031          * 2. oif also should be the same.
1032          */
1033         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1034 #ifdef CONFIG_IPV6_SUBTREES
1035             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1036 #endif
1037            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1038               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1039                 dst_release(dst);
1040                 dst = NULL;
1041         }
1042
1043 out:
1044         return dst;
1045 }
1046
1047 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1048                                struct dst_entry **dst, struct flowi6 *fl6)
1049 {
1050 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1051         struct neighbour *n;
1052         struct rt6_info *rt;
1053 #endif
1054         int err;
1055         int flags = 0;
1056
1057         /* The correct way to handle this would be to do
1058          * ip6_route_get_saddr, and then ip6_route_output; however,
1059          * the route-specific preferred source forces the
1060          * ip6_route_output call _before_ ip6_route_get_saddr.
1061          *
1062          * In source specific routing (no src=any default route),
1063          * ip6_route_output will fail given src=any saddr, though, so
1064          * that's why we try it again later.
1065          */
1066         if (ipv6_addr_any(&fl6->saddr)) {
1067                 struct fib6_info *from;
1068                 struct rt6_info *rt;
1069
1070                 *dst = ip6_route_output(net, sk, fl6);
1071                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1072
1073                 rcu_read_lock();
1074                 from = rt ? rcu_dereference(rt->from) : NULL;
1075                 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1076                                           sk ? inet6_sk(sk)->srcprefs : 0,
1077                                           &fl6->saddr);
1078                 rcu_read_unlock();
1079
1080                 if (err)
1081                         goto out_err_release;
1082
1083                 /* If we had an erroneous initial result, pretend it
1084                  * never existed and let the SA-enabled version take
1085                  * over.
1086                  */
1087                 if ((*dst)->error) {
1088                         dst_release(*dst);
1089                         *dst = NULL;
1090                 }
1091
1092                 if (fl6->flowi6_oif)
1093                         flags |= RT6_LOOKUP_F_IFACE;
1094         }
1095
1096         if (!*dst)
1097                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1098
1099         err = (*dst)->error;
1100         if (err)
1101                 goto out_err_release;
1102
1103 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1104         /*
1105          * Here if the dst entry we've looked up
1106          * has a neighbour entry that is in the INCOMPLETE
1107          * state and the src address from the flow is
1108          * marked as OPTIMISTIC, we release the found
1109          * dst entry and replace it instead with the
1110          * dst entry of the nexthop router
1111          */
1112         rt = (struct rt6_info *) *dst;
1113         rcu_read_lock_bh();
1114         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1115                                       rt6_nexthop(rt, &fl6->daddr));
1116         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1117         rcu_read_unlock_bh();
1118
1119         if (err) {
1120                 struct inet6_ifaddr *ifp;
1121                 struct flowi6 fl_gw6;
1122                 int redirect;
1123
1124                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1125                                       (*dst)->dev, 1);
1126
1127                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1128                 if (ifp)
1129                         in6_ifa_put(ifp);
1130
1131                 if (redirect) {
1132                         /*
1133                          * We need to get the dst entry for the
1134                          * default router instead
1135                          */
1136                         dst_release(*dst);
1137                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1138                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1139                         *dst = ip6_route_output(net, sk, &fl_gw6);
1140                         err = (*dst)->error;
1141                         if (err)
1142                                 goto out_err_release;
1143                 }
1144         }
1145 #endif
1146         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1147             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1148                 err = -EAFNOSUPPORT;
1149                 goto out_err_release;
1150         }
1151
1152         return 0;
1153
1154 out_err_release:
1155         dst_release(*dst);
1156         *dst = NULL;
1157
1158         if (err == -ENETUNREACH)
1159                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1160         return err;
1161 }
1162
1163 /**
1164  *      ip6_dst_lookup - perform route lookup on flow
1165  *      @net: Network namespace to perform lookup in
1166  *      @sk: socket which provides route info
1167  *      @dst: pointer to dst_entry * for result
1168  *      @fl6: flow to lookup
1169  *
1170  *      This function performs a route lookup on the given flow.
1171  *
1172  *      It returns zero on success, or a standard errno code on error.
1173  */
1174 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1175                    struct flowi6 *fl6)
1176 {
1177         *dst = NULL;
1178         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1179 }
1180 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1181
1182 /**
1183  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1184  *      @net: Network namespace to perform lookup in
1185  *      @sk: socket which provides route info
1186  *      @fl6: flow to lookup
1187  *      @final_dst: final destination address for ipsec lookup
1188  *
1189  *      This function performs a route lookup on the given flow.
1190  *
1191  *      It returns a valid dst pointer on success, or a pointer encoded
1192  *      error code.
1193  */
1194 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1195                                       const struct in6_addr *final_dst)
1196 {
1197         struct dst_entry *dst = NULL;
1198         int err;
1199
1200         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1201         if (err)
1202                 return ERR_PTR(err);
1203         if (final_dst)
1204                 fl6->daddr = *final_dst;
1205
1206         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1207 }
1208 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1209
1210 /**
1211  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1212  *      @sk: socket which provides the dst cache and route info
1213  *      @fl6: flow to lookup
1214  *      @final_dst: final destination address for ipsec lookup
1215  *      @connected: whether @sk is connected or not
1216  *
1217  *      This function performs a route lookup on the given flow with the
1218  *      possibility of using the cached route in the socket if it is valid.
1219  *      It will take the socket dst lock when operating on the dst cache.
1220  *      As a result, this function can only be used in process context.
1221  *
1222  *      In addition, for a connected socket, cache the dst in the socket
1223  *      if the current cache is not valid.
1224  *
1225  *      It returns a valid dst pointer on success, or a pointer encoded
1226  *      error code.
1227  */
1228 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1229                                          const struct in6_addr *final_dst,
1230                                          bool connected)
1231 {
1232         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1233
1234         dst = ip6_sk_dst_check(sk, dst, fl6);
1235         if (dst)
1236                 return dst;
1237
1238         dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1239         if (connected && !IS_ERR(dst))
1240                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1241
1242         return dst;
1243 }
1244 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1245
1246 /**
1247  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1248  *      @skb: Packet for which lookup is done
1249  *      @dev: Tunnel device
1250  *      @net: Network namespace of tunnel device
1251  *      @sock: Socket which provides route info
1252  *      @saddr: Memory to store the src ip address
1253  *      @info: Tunnel information
1254  *      @protocol: IP protocol
1255  *      @use_cache: Flag to enable cache usage
1256  *      This function performs a route lookup on a tunnel
1257  *
1258  *      It returns a valid dst pointer and stores src address to be used in
1259  *      tunnel in param saddr on success, else a pointer encoded error code.
1260  */
1261
1262 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1263                                         struct net_device *dev,
1264                                         struct net *net,
1265                                         struct socket *sock,
1266                                         struct in6_addr *saddr,
1267                                         const struct ip_tunnel_info *info,
1268                                         u8 protocol,
1269                                         bool use_cache)
1270 {
1271         struct dst_entry *dst = NULL;
1272 #ifdef CONFIG_DST_CACHE
1273         struct dst_cache *dst_cache;
1274 #endif
1275         struct flowi6 fl6;
1276         __u8 prio;
1277
1278 #ifdef CONFIG_DST_CACHE
1279         dst_cache = (struct dst_cache *)&info->dst_cache;
1280         if (use_cache) {
1281                 dst = dst_cache_get_ip6(dst_cache, saddr);
1282                 if (dst)
1283                         return dst;
1284         }
1285 #endif
1286         memset(&fl6, 0, sizeof(fl6));
1287         fl6.flowi6_mark = skb->mark;
1288         fl6.flowi6_proto = protocol;
1289         fl6.daddr = info->key.u.ipv6.dst;
1290         fl6.saddr = info->key.u.ipv6.src;
1291         prio = info->key.tos;
1292         fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1293
1294         dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1295                                               NULL);
1296         if (IS_ERR(dst)) {
1297                 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1298                 return ERR_PTR(-ENETUNREACH);
1299         }
1300         if (dst->dev == dev) { /* is this necessary? */
1301                 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1302                 dst_release(dst);
1303                 return ERR_PTR(-ELOOP);
1304         }
1305 #ifdef CONFIG_DST_CACHE
1306         if (use_cache)
1307                 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1308 #endif
1309         *saddr = fl6.saddr;
1310         return dst;
1311 }
1312 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1313
1314 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1315                                                gfp_t gfp)
1316 {
1317         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1318 }
1319
1320 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1321                                                 gfp_t gfp)
1322 {
1323         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1324 }
1325
1326 static void ip6_append_data_mtu(unsigned int *mtu,
1327                                 int *maxfraglen,
1328                                 unsigned int fragheaderlen,
1329                                 struct sk_buff *skb,
1330                                 struct rt6_info *rt,
1331                                 unsigned int orig_mtu)
1332 {
1333         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1334                 if (!skb) {
1335                         /* first fragment, reserve header_len */
1336                         *mtu = orig_mtu - rt->dst.header_len;
1337
1338                 } else {
1339                         /*
1340                          * this fragment is not first, the headers
1341                          * space is regarded as data space.
1342                          */
1343                         *mtu = orig_mtu;
1344                 }
1345                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1346                               + fragheaderlen - sizeof(struct frag_hdr);
1347         }
1348 }
1349
1350 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1351                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1352                           struct rt6_info *rt, struct flowi6 *fl6)
1353 {
1354         struct ipv6_pinfo *np = inet6_sk(sk);
1355         unsigned int mtu;
1356         struct ipv6_txoptions *opt = ipc6->opt;
1357
1358         /*
1359          * setup for corking
1360          */
1361         if (opt) {
1362                 if (WARN_ON(v6_cork->opt))
1363                         return -EINVAL;
1364
1365                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1366                 if (unlikely(!v6_cork->opt))
1367                         return -ENOBUFS;
1368
1369                 v6_cork->opt->tot_len = sizeof(*opt);
1370                 v6_cork->opt->opt_flen = opt->opt_flen;
1371                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1372
1373                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1374                                                     sk->sk_allocation);
1375                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1376                         return -ENOBUFS;
1377
1378                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1379                                                     sk->sk_allocation);
1380                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1381                         return -ENOBUFS;
1382
1383                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1384                                                    sk->sk_allocation);
1385                 if (opt->hopopt && !v6_cork->opt->hopopt)
1386                         return -ENOBUFS;
1387
1388                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1389                                                     sk->sk_allocation);
1390                 if (opt->srcrt && !v6_cork->opt->srcrt)
1391                         return -ENOBUFS;
1392
1393                 /* need source address above miyazawa*/
1394         }
1395         dst_hold(&rt->dst);
1396         cork->base.dst = &rt->dst;
1397         cork->fl.u.ip6 = *fl6;
1398         v6_cork->hop_limit = ipc6->hlimit;
1399         v6_cork->tclass = ipc6->tclass;
1400         if (rt->dst.flags & DST_XFRM_TUNNEL)
1401                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1402                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1403         else
1404                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1405                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1406         if (np->frag_size < mtu) {
1407                 if (np->frag_size)
1408                         mtu = np->frag_size;
1409         }
1410         cork->base.fragsize = mtu;
1411         cork->base.gso_size = ipc6->gso_size;
1412         cork->base.tx_flags = 0;
1413         cork->base.mark = ipc6->sockc.mark;
1414         sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1415
1416         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1417                 cork->base.flags |= IPCORK_ALLFRAG;
1418         cork->base.length = 0;
1419
1420         cork->base.transmit_time = ipc6->sockc.transmit_time;
1421
1422         return 0;
1423 }
1424
1425 static int __ip6_append_data(struct sock *sk,
1426                              struct flowi6 *fl6,
1427                              struct sk_buff_head *queue,
1428                              struct inet_cork *cork,
1429                              struct inet6_cork *v6_cork,
1430                              struct page_frag *pfrag,
1431                              int getfrag(void *from, char *to, int offset,
1432                                          int len, int odd, struct sk_buff *skb),
1433                              void *from, int length, int transhdrlen,
1434                              unsigned int flags, struct ipcm6_cookie *ipc6)
1435 {
1436         struct sk_buff *skb, *skb_prev = NULL;
1437         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1438         struct ubuf_info *uarg = NULL;
1439         int exthdrlen = 0;
1440         int dst_exthdrlen = 0;
1441         int hh_len;
1442         int copy;
1443         int err;
1444         int offset = 0;
1445         u32 tskey = 0;
1446         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1447         struct ipv6_txoptions *opt = v6_cork->opt;
1448         int csummode = CHECKSUM_NONE;
1449         unsigned int maxnonfragsize, headersize;
1450         unsigned int wmem_alloc_delta = 0;
1451         bool paged, extra_uref = false;
1452
1453         skb = skb_peek_tail(queue);
1454         if (!skb) {
1455                 exthdrlen = opt ? opt->opt_flen : 0;
1456                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1457         }
1458
1459         paged = !!cork->gso_size;
1460         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1461         orig_mtu = mtu;
1462
1463         if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1464             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1465                 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1466
1467         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1468
1469         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1470                         (opt ? opt->opt_nflen : 0);
1471
1472         headersize = sizeof(struct ipv6hdr) +
1473                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1474                      (dst_allfrag(&rt->dst) ?
1475                       sizeof(struct frag_hdr) : 0) +
1476                      rt->rt6i_nfheader_len;
1477
1478         if (mtu <= fragheaderlen ||
1479             ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1480                 goto emsgsize;
1481
1482         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1483                      sizeof(struct frag_hdr);
1484
1485         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1486          * the first fragment
1487          */
1488         if (headersize + transhdrlen > mtu)
1489                 goto emsgsize;
1490
1491         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1492             (sk->sk_protocol == IPPROTO_UDP ||
1493              sk->sk_protocol == IPPROTO_RAW)) {
1494                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1495                                 sizeof(struct ipv6hdr));
1496                 goto emsgsize;
1497         }
1498
1499         if (ip6_sk_ignore_df(sk))
1500                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1501         else
1502                 maxnonfragsize = mtu;
1503
1504         if (cork->length + length > maxnonfragsize - headersize) {
1505 emsgsize:
1506                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1507                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1508                 return -EMSGSIZE;
1509         }
1510
1511         /* CHECKSUM_PARTIAL only with no extension headers and when
1512          * we are not going to fragment
1513          */
1514         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1515             headersize == sizeof(struct ipv6hdr) &&
1516             length <= mtu - headersize &&
1517             (!(flags & MSG_MORE) || cork->gso_size) &&
1518             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1519                 csummode = CHECKSUM_PARTIAL;
1520
1521         if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1522                 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1523                 if (!uarg)
1524                         return -ENOBUFS;
1525                 extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1526                 if (rt->dst.dev->features & NETIF_F_SG &&
1527                     csummode == CHECKSUM_PARTIAL) {
1528                         paged = true;
1529                 } else {
1530                         uarg->zerocopy = 0;
1531                         skb_zcopy_set(skb, uarg, &extra_uref);
1532                 }
1533         }
1534
1535         /*
1536          * Let's try using as much space as possible.
1537          * Use MTU if total length of the message fits into the MTU.
1538          * Otherwise, we need to reserve fragment header and
1539          * fragment alignment (= 8-15 octects, in total).
1540          *
1541          * Note that we may need to "move" the data from the tail
1542          * of the buffer to the new fragment when we split
1543          * the message.
1544          *
1545          * FIXME: It may be fragmented into multiple chunks
1546          *        at once if non-fragmentable extension headers
1547          *        are too large.
1548          * --yoshfuji
1549          */
1550
1551         cork->length += length;
1552         if (!skb)
1553                 goto alloc_new_skb;
1554
1555         while (length > 0) {
1556                 /* Check if the remaining data fits into current packet. */
1557                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1558                 if (copy < length)
1559                         copy = maxfraglen - skb->len;
1560
1561                 if (copy <= 0) {
1562                         char *data;
1563                         unsigned int datalen;
1564                         unsigned int fraglen;
1565                         unsigned int fraggap;
1566                         unsigned int alloclen, alloc_extra;
1567                         unsigned int pagedlen;
1568 alloc_new_skb:
1569                         /* There's no room in the current skb */
1570                         if (skb)
1571                                 fraggap = skb->len - maxfraglen;
1572                         else
1573                                 fraggap = 0;
1574                         /* update mtu and maxfraglen if necessary */
1575                         if (!skb || !skb_prev)
1576                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1577                                                     fragheaderlen, skb, rt,
1578                                                     orig_mtu);
1579
1580                         skb_prev = skb;
1581
1582                         /*
1583                          * If remaining data exceeds the mtu,
1584                          * we know we need more fragment(s).
1585                          */
1586                         datalen = length + fraggap;
1587
1588                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1589                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1590                         fraglen = datalen + fragheaderlen;
1591                         pagedlen = 0;
1592
1593                         alloc_extra = hh_len;
1594                         alloc_extra += dst_exthdrlen;
1595                         alloc_extra += rt->dst.trailer_len;
1596
1597                         /* We just reserve space for fragment header.
1598                          * Note: this may be overallocation if the message
1599                          * (without MSG_MORE) fits into the MTU.
1600                          */
1601                         alloc_extra += sizeof(struct frag_hdr);
1602
1603                         if ((flags & MSG_MORE) &&
1604                             !(rt->dst.dev->features&NETIF_F_SG))
1605                                 alloclen = mtu;
1606                         else if (!paged &&
1607                                  (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1608                                   !(rt->dst.dev->features & NETIF_F_SG)))
1609                                 alloclen = fraglen;
1610                         else {
1611                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1612                                 pagedlen = fraglen - alloclen;
1613                         }
1614                         alloclen += alloc_extra;
1615
1616                         if (datalen != length + fraggap) {
1617                                 /*
1618                                  * this is not the last fragment, the trailer
1619                                  * space is regarded as data space.
1620                                  */
1621                                 datalen += rt->dst.trailer_len;
1622                         }
1623
1624                         fraglen = datalen + fragheaderlen;
1625
1626                         copy = datalen - transhdrlen - fraggap - pagedlen;
1627                         if (copy < 0) {
1628                                 err = -EINVAL;
1629                                 goto error;
1630                         }
1631                         if (transhdrlen) {
1632                                 skb = sock_alloc_send_skb(sk, alloclen,
1633                                                 (flags & MSG_DONTWAIT), &err);
1634                         } else {
1635                                 skb = NULL;
1636                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1637                                     2 * sk->sk_sndbuf)
1638                                         skb = alloc_skb(alloclen,
1639                                                         sk->sk_allocation);
1640                                 if (unlikely(!skb))
1641                                         err = -ENOBUFS;
1642                         }
1643                         if (!skb)
1644                                 goto error;
1645                         /*
1646                          *      Fill in the control structures
1647                          */
1648                         skb->protocol = htons(ETH_P_IPV6);
1649                         skb->ip_summed = csummode;
1650                         skb->csum = 0;
1651                         /* reserve for fragmentation and ipsec header */
1652                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1653                                     dst_exthdrlen);
1654
1655                         /*
1656                          *      Find where to start putting bytes
1657                          */
1658                         data = skb_put(skb, fraglen - pagedlen);
1659                         skb_set_network_header(skb, exthdrlen);
1660                         data += fragheaderlen;
1661                         skb->transport_header = (skb->network_header +
1662                                                  fragheaderlen);
1663                         if (fraggap) {
1664                                 skb->csum = skb_copy_and_csum_bits(
1665                                         skb_prev, maxfraglen,
1666                                         data + transhdrlen, fraggap);
1667                                 skb_prev->csum = csum_sub(skb_prev->csum,
1668                                                           skb->csum);
1669                                 data += fraggap;
1670                                 pskb_trim_unique(skb_prev, maxfraglen);
1671                         }
1672                         if (copy > 0 &&
1673                             getfrag(from, data + transhdrlen, offset,
1674                                     copy, fraggap, skb) < 0) {
1675                                 err = -EFAULT;
1676                                 kfree_skb(skb);
1677                                 goto error;
1678                         }
1679
1680                         offset += copy;
1681                         length -= copy + transhdrlen;
1682                         transhdrlen = 0;
1683                         exthdrlen = 0;
1684                         dst_exthdrlen = 0;
1685
1686                         /* Only the initial fragment is time stamped */
1687                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1688                         cork->tx_flags = 0;
1689                         skb_shinfo(skb)->tskey = tskey;
1690                         tskey = 0;
1691                         skb_zcopy_set(skb, uarg, &extra_uref);
1692
1693                         if ((flags & MSG_CONFIRM) && !skb_prev)
1694                                 skb_set_dst_pending_confirm(skb, 1);
1695
1696                         /*
1697                          * Put the packet on the pending queue
1698                          */
1699                         if (!skb->destructor) {
1700                                 skb->destructor = sock_wfree;
1701                                 skb->sk = sk;
1702                                 wmem_alloc_delta += skb->truesize;
1703                         }
1704                         __skb_queue_tail(queue, skb);
1705                         continue;
1706                 }
1707
1708                 if (copy > length)
1709                         copy = length;
1710
1711                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1712                     skb_tailroom(skb) >= copy) {
1713                         unsigned int off;
1714
1715                         off = skb->len;
1716                         if (getfrag(from, skb_put(skb, copy),
1717                                                 offset, copy, off, skb) < 0) {
1718                                 __skb_trim(skb, off);
1719                                 err = -EFAULT;
1720                                 goto error;
1721                         }
1722                 } else if (!uarg || !uarg->zerocopy) {
1723                         int i = skb_shinfo(skb)->nr_frags;
1724
1725                         err = -ENOMEM;
1726                         if (!sk_page_frag_refill(sk, pfrag))
1727                                 goto error;
1728
1729                         if (!skb_can_coalesce(skb, i, pfrag->page,
1730                                               pfrag->offset)) {
1731                                 err = -EMSGSIZE;
1732                                 if (i == MAX_SKB_FRAGS)
1733                                         goto error;
1734
1735                                 __skb_fill_page_desc(skb, i, pfrag->page,
1736                                                      pfrag->offset, 0);
1737                                 skb_shinfo(skb)->nr_frags = ++i;
1738                                 get_page(pfrag->page);
1739                         }
1740                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1741                         if (getfrag(from,
1742                                     page_address(pfrag->page) + pfrag->offset,
1743                                     offset, copy, skb->len, skb) < 0)
1744                                 goto error_efault;
1745
1746                         pfrag->offset += copy;
1747                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1748                         skb->len += copy;
1749                         skb->data_len += copy;
1750                         skb->truesize += copy;
1751                         wmem_alloc_delta += copy;
1752                 } else {
1753                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1754                         if (err < 0)
1755                                 goto error;
1756                 }
1757                 offset += copy;
1758                 length -= copy;
1759         }
1760
1761         if (wmem_alloc_delta)
1762                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1763         return 0;
1764
1765 error_efault:
1766         err = -EFAULT;
1767 error:
1768         net_zcopy_put_abort(uarg, extra_uref);
1769         cork->length -= length;
1770         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1771         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1772         return err;
1773 }
1774
1775 int ip6_append_data(struct sock *sk,
1776                     int getfrag(void *from, char *to, int offset, int len,
1777                                 int odd, struct sk_buff *skb),
1778                     void *from, int length, int transhdrlen,
1779                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1780                     struct rt6_info *rt, unsigned int flags)
1781 {
1782         struct inet_sock *inet = inet_sk(sk);
1783         struct ipv6_pinfo *np = inet6_sk(sk);
1784         int exthdrlen;
1785         int err;
1786
1787         if (flags&MSG_PROBE)
1788                 return 0;
1789         if (skb_queue_empty(&sk->sk_write_queue)) {
1790                 /*
1791                  * setup for corking
1792                  */
1793                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1794                                      ipc6, rt, fl6);
1795                 if (err)
1796                         return err;
1797
1798                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1799                 length += exthdrlen;
1800                 transhdrlen += exthdrlen;
1801         } else {
1802                 fl6 = &inet->cork.fl.u.ip6;
1803                 transhdrlen = 0;
1804         }
1805
1806         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1807                                  &np->cork, sk_page_frag(sk), getfrag,
1808                                  from, length, transhdrlen, flags, ipc6);
1809 }
1810 EXPORT_SYMBOL_GPL(ip6_append_data);
1811
1812 static void ip6_cork_release(struct inet_cork_full *cork,
1813                              struct inet6_cork *v6_cork)
1814 {
1815         if (v6_cork->opt) {
1816                 kfree(v6_cork->opt->dst0opt);
1817                 kfree(v6_cork->opt->dst1opt);
1818                 kfree(v6_cork->opt->hopopt);
1819                 kfree(v6_cork->opt->srcrt);
1820                 kfree(v6_cork->opt);
1821                 v6_cork->opt = NULL;
1822         }
1823
1824         if (cork->base.dst) {
1825                 dst_release(cork->base.dst);
1826                 cork->base.dst = NULL;
1827                 cork->base.flags &= ~IPCORK_ALLFRAG;
1828         }
1829         memset(&cork->fl, 0, sizeof(cork->fl));
1830 }
1831
1832 struct sk_buff *__ip6_make_skb(struct sock *sk,
1833                                struct sk_buff_head *queue,
1834                                struct inet_cork_full *cork,
1835                                struct inet6_cork *v6_cork)
1836 {
1837         struct sk_buff *skb, *tmp_skb;
1838         struct sk_buff **tail_skb;
1839         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1840         struct ipv6_pinfo *np = inet6_sk(sk);
1841         struct net *net = sock_net(sk);
1842         struct ipv6hdr *hdr;
1843         struct ipv6_txoptions *opt = v6_cork->opt;
1844         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1845         struct flowi6 *fl6 = &cork->fl.u.ip6;
1846         unsigned char proto = fl6->flowi6_proto;
1847
1848         skb = __skb_dequeue(queue);
1849         if (!skb)
1850                 goto out;
1851         tail_skb = &(skb_shinfo(skb)->frag_list);
1852
1853         /* move skb->data to ip header from ext header */
1854         if (skb->data < skb_network_header(skb))
1855                 __skb_pull(skb, skb_network_offset(skb));
1856         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1857                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1858                 *tail_skb = tmp_skb;
1859                 tail_skb = &(tmp_skb->next);
1860                 skb->len += tmp_skb->len;
1861                 skb->data_len += tmp_skb->len;
1862                 skb->truesize += tmp_skb->truesize;
1863                 tmp_skb->destructor = NULL;
1864                 tmp_skb->sk = NULL;
1865         }
1866
1867         /* Allow local fragmentation. */
1868         skb->ignore_df = ip6_sk_ignore_df(sk);
1869
1870         *final_dst = fl6->daddr;
1871         __skb_pull(skb, skb_network_header_len(skb));
1872         if (opt && opt->opt_flen)
1873                 ipv6_push_frag_opts(skb, opt, &proto);
1874         if (opt && opt->opt_nflen)
1875                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1876
1877         skb_push(skb, sizeof(struct ipv6hdr));
1878         skb_reset_network_header(skb);
1879         hdr = ipv6_hdr(skb);
1880
1881         ip6_flow_hdr(hdr, v6_cork->tclass,
1882                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1883                                         ip6_autoflowlabel(net, np), fl6));
1884         hdr->hop_limit = v6_cork->hop_limit;
1885         hdr->nexthdr = proto;
1886         hdr->saddr = fl6->saddr;
1887         hdr->daddr = *final_dst;
1888
1889         skb->priority = sk->sk_priority;
1890         skb->mark = cork->base.mark;
1891
1892         skb->tstamp = cork->base.transmit_time;
1893
1894         skb_dst_set(skb, dst_clone(&rt->dst));
1895         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1896         if (proto == IPPROTO_ICMPV6) {
1897                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1898
1899                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1900                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1901         }
1902
1903         ip6_cork_release(cork, v6_cork);
1904 out:
1905         return skb;
1906 }
1907
1908 int ip6_send_skb(struct sk_buff *skb)
1909 {
1910         struct net *net = sock_net(skb->sk);
1911         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1912         int err;
1913
1914         err = ip6_local_out(net, skb->sk, skb);
1915         if (err) {
1916                 if (err > 0)
1917                         err = net_xmit_errno(err);
1918                 if (err)
1919                         IP6_INC_STATS(net, rt->rt6i_idev,
1920                                       IPSTATS_MIB_OUTDISCARDS);
1921         }
1922
1923         return err;
1924 }
1925
/*
 * ip6_push_pending_frames - build and send whatever is pending on @sk.
 *
 * Returns 0 when ip6_finish_skb() yields no packet, otherwise the result
 * of ip6_send_skb() for the packet it produced.
 */
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *pkt = ip6_finish_skb(sk);

        return pkt ? ip6_send_skb(pkt) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1937
1938 static void __ip6_flush_pending_frames(struct sock *sk,
1939                                        struct sk_buff_head *queue,
1940                                        struct inet_cork_full *cork,
1941                                        struct inet6_cork *v6_cork)
1942 {
1943         struct sk_buff *skb;
1944
1945         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1946                 if (skb_dst(skb))
1947                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1948                                       IPSTATS_MIB_OUTDISCARDS);
1949                 kfree_skb(skb);
1950         }
1951
1952         ip6_cork_release(cork, v6_cork);
1953 }
1954
1955 void ip6_flush_pending_frames(struct sock *sk)
1956 {
1957         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1958                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1959 }
1960 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1961
1962 struct sk_buff *ip6_make_skb(struct sock *sk,
1963                              int getfrag(void *from, char *to, int offset,
1964                                          int len, int odd, struct sk_buff *skb),
1965                              void *from, int length, int transhdrlen,
1966                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1967                              struct rt6_info *rt, unsigned int flags,
1968                              struct inet_cork_full *cork)
1969 {
1970         struct inet6_cork v6_cork;
1971         struct sk_buff_head queue;
1972         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1973         int err;
1974
1975         if (flags & MSG_PROBE)
1976                 return NULL;
1977
1978         __skb_queue_head_init(&queue);
1979
1980         cork->base.flags = 0;
1981         cork->base.addr = 0;
1982         cork->base.opt = NULL;
1983         cork->base.dst = NULL;
1984         v6_cork.opt = NULL;
1985         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1986         if (err) {
1987                 ip6_cork_release(cork, &v6_cork);
1988                 return ERR_PTR(err);
1989         }
1990         if (ipc6->dontfrag < 0)
1991                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1992
1993         err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1994                                 &current->task_frag, getfrag, from,
1995                                 length + exthdrlen, transhdrlen + exthdrlen,
1996                                 flags, ipc6);
1997         if (err) {
1998                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1999                 return ERR_PTR(err);
2000         }
2001
2002         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2003 }